/
intsimdmatrix.cpp
133 lines (127 loc) · 5.43 KB
/
intsimdmatrix.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrix.cpp
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author: Ray Smith
// Created: Tue Aug 15 08:01:32 PST 2017
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#include "intsimdmatrix.h"
#include "intsimdmatrixavx2.h"
#include "intsimdmatrixsse.h"
#include "simddetect.h"
namespace tesseract {
// Factory: creates the most capable IntSimdMatrix (sub)class that the
// detected CPU features allow. AVX2 is preferred over SSE, and the plain
// C++ base class is used when neither is reported as available.
// The object is allocated with new and returned as a raw pointer.
/* static */
IntSimdMatrix* IntSimdMatrix::GetFastestMultiplier() {
  if (SIMDDetect::IsAVX2Available()) return new IntSimdMatrixAVX2();
  if (SIMDDetect::IsSSEAvailable()) return new IntSimdMatrixSSE();
  // No SIMD support detected: fall back to the default c++ implementation.
  return new IntSimdMatrix();
}
// Computes a reshaped copy of the weight matrix w. If there are no
// partial_funcs_, it does nothing.
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w) {
if (partial_funcs_.empty()) return;
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// The rounded-up sizes of the reshaped weight matrix, excluding biases.
int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
int rounded_num_out = RoundOutputs(num_out);
// Add the bias and compute the required size.
shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0);
int shaped_index = 0;
int output = 0;
// Each number of registers needs a different format! Iterates over the
// different numbers of registers (each a power of 2).
for (int num_registers = max_output_registers_; num_registers >= 1;
num_registers /= 2) {
// The number of outputs that we will generate with this many registers.
int num_outputs_per_register_set =
num_registers * num_outputs_per_register_;
// Use the max number of registers until we have to go fewer.
while (output + num_outputs_per_register_set <= rounded_num_out) {
// Accumulating outputs in registers saves iterating over the inputs, so
// we only have to do it once per output register set.
for (int input = 0; input < num_in; input += num_inputs_per_group_) {
// Iterate over the number of outputs in a register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
// Inner-most loop corresponds to the number of inputs in an input
// group.
for (int i = 0; i < num_inputs_per_group_; ++i) {
int8_t weight = 0;
if (output + j < num_out && input + i < num_in)
weight = w(output + j, input + i);
shaped_w_[shaped_index++] = weight;
}
}
}
// Append the bias weights for the register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
int8_t weight = 0;
if (output + j < num_out) weight = w(output + j, num_in);
shaped_w_[shaped_index++] = weight;
}
output += num_outputs_per_register_set;
}
}
}
// Matrix-vector product v = Wu.
// u holds W.dim2() - 1 values and v receives W.dim1() values.
// Conceptually u carries one more trailing element equal to 1 so that the
// last column of W acts as the bias, but that element is not actually stored.
// Each output is multiplied by the matching entry of scales.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
                                    const GenericVector<double>& scales,
                                    const int8_t* u, double* v) const {
  const int num_out = w.dim1();
  const int num_in = w.dim2() - 1;
  if (partial_funcs_.empty()) {
    // Base implementation: straightforward dot product per output row.
    for (int row = 0; row < num_out; ++row) {
      const int8_t* weights = w[row];
      int dot = 0;
      for (int col = 0; col < num_in; ++col) dot += weights[col] * u[col];
      // Correct for the integer representation, then add in the bias.
      v[row] =
          (static_cast<double>(dot) / MAX_INT8 + weights[num_in]) * scales[row];
    }
    return;
  }
  // Otherwise work from the reshaped weights held in shaped_w_.
  const int8_t* w_data = shaped_w_.data();
  const double* scales_data = &scales[0];
  // A call to a partial function yields group_size outputs, apart from the
  // final call, which may yield fewer.
  int group_size = num_outputs_per_register_ * max_output_registers_;
  const int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
  const int rounded_num_out = RoundOutputs(num_out);
  int output = 0;
  for (auto fn : partial_funcs_) {
    // How far each call to fn advances through w_data.
    const int w_step = (rounded_num_in + 1) * group_size;
    // Stay with this group size while a full group still fits in the output;
    // after that, move on to the next function with half the group size.
    while (output + group_size <= rounded_num_out) {
      (*fn)(w_data, scales_data, u, rounded_num_in, num_out - output, v);
      w_data += w_step;
      scales_data += group_size;
      v += group_size;
      output += group_size;
    }
    group_size /= 2;
  }
}
} // namespace tesseract