Skip to content

Commit

Permalink
Added intsimdmatrix as a generic integer matrixdotvector function wit…
Browse files Browse the repository at this point in the history
…h AVX2 and SSE specializations
  • Loading branch information
Ray Smith committed Sep 8, 2017
1 parent ad74e8a commit fc6a390
Show file tree
Hide file tree
Showing 21 changed files with 1,549 additions and 41 deletions.
2 changes: 2 additions & 0 deletions api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ libtesseract_api_la_LIBADD = \
../dict/libtesseract_dict.la \
../arch/libtesseract_arch.la \
../arch/libtesseract_avx.la \
../arch/libtesseract_avx2.la \
../arch/libtesseract_sse.la \
../lstm/libtesseract_lstm.la \
../ccstruct/libtesseract_ccstruct.la \
Expand Down Expand Up @@ -60,6 +61,7 @@ libtesseract_la_LIBADD = \
../dict/libtesseract_dict.la \
../arch/libtesseract_arch.la \
../arch/libtesseract_avx.la \
../arch/libtesseract_avx2.la \
../arch/libtesseract_sse.la \
../lstm/libtesseract_lstm.la \
../ccstruct/libtesseract_ccstruct.la \
Expand Down
18 changes: 12 additions & 6 deletions arch/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
AM_CPPFLAGS += -I$(top_srcdir)/ccstruct -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
AUTOMAKE_OPTIONS = subdir-objects
SUBDIRS =
AM_CXXFLAGS =
Expand All @@ -8,31 +8,37 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
AM_CPPFLAGS += -DTESS_EXPORTS
endif

include_HEADERS = dotproductavx.h dotproductsse.h simddetect.h
include_HEADERS = dotproductavx.h dotproductsse.h intsimdmatrix.h intsimdmatrixavx2.h intsimdmatrixsse.h simddetect.h

noinst_HEADERS =

if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_arch.la
else
lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
lib_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
lib_LTLIBRARIES += libtesseract_arch.la
libtesseract_arch_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
libtesseract_avx2_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
endif

if AVX_OPT
libtesseract_avx_la_CXXFLAGS = -mavx
endif
if AVX2_OPT
libtesseract_avx2_la_CXXFLAGS = -mavx2
endif
if SSE41_OPT
libtesseract_sse_la_CXXFLAGS = -msse4.1
endif

libtesseract_arch_la_SOURCES = simddetect.cpp
libtesseract_arch_la_SOURCES = intsimdmatrix.cpp simddetect.cpp

libtesseract_avx_la_SOURCES = dotproductavx.cpp

libtesseract_sse_la_SOURCES = dotproductsse.cpp
libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp

libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp

133 changes: 133 additions & 0 deletions arch/intsimdmatrix.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrix.cpp
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author: Ray Smith
// Created: Tue Aug 15 08:01:32 PST 2017
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#include "intsimdmatrix.h"
#include "intsimdmatrixavx2.h"
#include "intsimdmatrixsse.h"
#include "simddetect.h"

namespace tesseract {

// Factory: picks the IntSimdMatrix (sub)class that best matches what
// SIMDDetect reports for the current machine. AVX2 is preferred, then SSE,
// and the plain C++ base class is used when neither is available.
// The returned object is allocated with new.
/* static */
IntSimdMatrix* IntSimdMatrix::GetFastestMultiplier() {
  if (SIMDDetect::IsAVX2Available()) return new IntSimdMatrixAVX2();
  if (SIMDDetect::IsSSEAvailable()) return new IntSimdMatrixSSE();
  // No usable SIMD extension: default c++ implementation.
  return new IntSimdMatrix();
}

// Computes a reshaped copy of the weight matrix w. If there are no
// partial_funcs_, it does nothing.
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w) {
if (partial_funcs_.empty()) return;
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// The rounded-up sizes of the reshaped weight matrix, excluding biases.
int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
int rounded_num_out = RoundOutputs(num_out);
// Add the bias and compute the required size.
shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0);
int shaped_index = 0;
int output = 0;
// Each number of registers needs a different format! Iterates over the
// different numbers of registers (each a power of 2).
for (int num_registers = max_output_registers_; num_registers >= 1;
num_registers /= 2) {
// The number of outputs that we will generate with this many registers.
int num_outputs_per_register_set =
num_registers * num_outputs_per_register_;
// Use the max number of registers until we have to go fewer.
while (output + num_outputs_per_register_set <= rounded_num_out) {
// Accumulating outputs in registers saves iterating over the inputs, so
// we only have to do it once per output register set.
for (int input = 0; input < num_in; input += num_inputs_per_group_) {
// Iterate over the number of outputs in a register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
// Inner-most loop corresponds to the number of inputs in an input
// group.
for (int i = 0; i < num_inputs_per_group_; ++i) {
int8_t weight = 0;
if (output + j < num_out && input + i < num_in)
weight = w(output + j, input + i);
shaped_w_[shaped_index++] = weight;
}
}
}
// Append the bias weights for the register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
int8_t weight = 0;
if (output + j < num_out) weight = w(output + j, num_in);
shaped_w_[shaped_index++] = weight;
}
output += num_outputs_per_register_set;
}
}
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
const GenericVector<double>& scales,
const int8_t* u, double* v) const {
int num_out = w.dim1();
int num_in = w.dim2() - 1;
if (partial_funcs_.empty()) {
// Base implementation.
for (int i = 0; i < num_out; ++i) {
const int8_t* wi = w[i];
int total = 0;
for (int j = 0; j < num_in; ++j) total += wi[j] * u[j];
// Add in the bias and correct for integer values.
v[i] = (static_cast<double>(total) / MAX_INT8 + wi[num_in]) * scales[i];
}
} else {
const int8_t* w_data = shaped_w_.data();
const double* scales_data = &scales[0];
// Each call to a partial_func_ produces group_size outputs, except the
// last one, which can produce less.
int group_size = num_outputs_per_register_ * max_output_registers_;
int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
int rounded_num_out = RoundOutputs(num_out);
int output = 0;
for (auto fn : partial_funcs_) {
// The amount of w_data consumed by each call to fn.
int w_step = (rounded_num_in + 1) * group_size;
// Run with this group size, until it would produce too much output, then
// switch to a smaller size.
for (; output + group_size <= rounded_num_out; output += group_size) {
(*fn)(w_data, scales_data, u, rounded_num_in, num_out - output, v);
w_data += w_step;
scales_data += group_size;
v += group_size;
}
group_size /= 2;
}
}
}

} // namespace tesseract
135 changes: 135 additions & 0 deletions arch/intsimdmatrix.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrix.h
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author: Ray Smith
// Created: Tue Aug 15 07:37:20 PST 2017
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
#define TESSERACT_ARCH_INTSIMDMATRIX_H_

#include <stdint.h>
#include <vector>
#include "genericvector.h"
#include "matrix.h"

namespace tesseract {

// Base class for a SIMD function to multiply a matrix by a vector, with sources
// of 8-bit signed integer, and result in a double, after appropriate scaling.
// Assumes a specific method of multiplication that can be applied to any size
// and number of SIMD registers as follows:
// int32_t results are computed with num_outputs_per_register_ in each of
// max_output_registers_ result registers, repeatedly until it would make too
// many results, then the number of registers is halved, and so-on down to a
// single result register. The last calculation only outputs the required number
// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
//  num_outputs_per_register_ = 4, and max_output_registers_ = 8,
// Step 1: 8x4=32 results are computed,
// Step 2: 8x4=32 again, total 64,
// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
// Step 4: 1x3, total 75.
// Each step above is computed using a PartialFunc, which runs over the input
// vector once. The input is read one registerful of num_inputs_per_register_
// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
// Since it is slow (on Intel at least) to horizontally add in a register,
// provision is made to process num_inputs_per_group_ inputs at a time, with
// the group being replicated num_input_groups_ times and multiplied by a
// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
// This is most convenient if num_inputs_per_group_ is 4, and the product
// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
// results in the process, but it doesn't have to be implemented that way.
// The weights are re-ordered by Init() to be used sequentially by the above
// algorithm, followed by the biases, so they can be added at the end.
// The base class computes the base C++ implementation.
// NOTE that, although the subclasses execute on different SIMD hardware, no
// virtual compute methods are needed, as the constructor sets up everything
// that is required to allow the base class implementation to do all the work.
// The destructor is nevertheless virtual: GetFastestMultiplier() hands out
// subclass objects through an IntSimdMatrix*, and deleting such an object
// through the base pointer is undefined behavior unless the base destructor
// is virtual.
class IntSimdMatrix {
 public:
  // Constructor should set the data members to indicate the sizes.
  // NOTE: Base constructor public only for test purposes.
  IntSimdMatrix()
      : num_outputs_per_register_(1),
        max_output_registers_(1),
        num_inputs_per_register_(1),
        num_inputs_per_group_(1),
        num_input_groups_(1) {}

  // Virtual so that an object obtained from GetFastestMultiplier() can be
  // deleted safely through an IntSimdMatrix*.
  virtual ~IntSimdMatrix() = default;

  // Declaring the destructor would otherwise suppress the implicit move
  // operations, so copy and move are defaulted explicitly to keep the class
  // copyable and movable exactly as before.
  IntSimdMatrix(const IntSimdMatrix&) = default;
  IntSimdMatrix& operator=(const IntSimdMatrix&) = default;
  IntSimdMatrix(IntSimdMatrix&&) = default;
  IntSimdMatrix& operator=(IntSimdMatrix&&) = default;

  // Factory makes and returns an IntSimdMatrix (sub)class of the best
  // available type for the current architecture.
  static IntSimdMatrix* GetFastestMultiplier();

  // Computes a reshaped copy of the weight matrix w. If there are no
  // partial_funcs_, it does nothing.
  void Init(const GENERIC_2D_ARRAY<int8_t>& w);

  // Rounds the size up to a multiple of the input register size (in int8_t).
  int RoundInputs(int size) const {
    return Roundup(size, num_inputs_per_register_);
  }
  // Rounds the size up to a multiple of the output register size (in int32_t).
  int RoundOutputs(int size) const {
    return Roundup(size, num_outputs_per_register_);
  }

  // Computes matrix.vector v = Wu.
  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
  // u is imagined to have an extra element at the end with value 1, to
  // implement the bias, but it doesn't actually have it.
  // Computes the base C++ implementation, if there are no partial_funcs_.
  // NOTE: The size of the input vector (u) must be padded using
  // RoundInputs above.
  // The input will be over-read to the extent of the padding. There are no
  // alignment requirements.
  void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
                       const GenericVector<double>& scales, const int8_t* u,
                       double* v) const;

 protected:
  // Function to compute part of a matrix.vector multiplication. The weights
  // are in a very specific order (see above) in w, which is multiplied by
  // u of length num_in, to produce output v after scaling the integer results
  // by the corresponding member of scales.
  // The amount of w and scales consumed is fixed and not available to the
  // caller. The number of outputs written to v will be at most num_out.
  typedef void (*PartialFunc)(const int8_t* w, const double* scales,
                              const int8_t* u, int num_in, int num_out,
                              double* v);

  // Rounds the input up to a multiple of the given factor.
  static int Roundup(int input, int factor) {
    return (input + factor - 1) / factor * factor;
  }

  // Number of 32 bit outputs held in each register.
  int num_outputs_per_register_;
  // Maximum number of registers that we will use to hold outputs.
  int max_output_registers_;
  // Number of 8 bit inputs in the inputs register.
  int num_inputs_per_register_;
  // Number of inputs in each weight group.
  int num_inputs_per_group_;
  // Number of groups of inputs to be broadcast.
  int num_input_groups_;
  // The weights matrix reorganized in whatever way suits this instance.
  std::vector<int8_t> shaped_w_;
  // A series of functions to compute a partial result.
  std::vector<PartialFunc> partial_funcs_;
};

} // namespace tesseract

#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
Loading

2 comments on commit fc6a390

@Shreeshrii
Copy link
Collaborator

@Shreeshrii Shreeshrii commented on fc6a390 Sep 10, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. unittest/gunit.h and unittest/include_gunit.h are the same file.
    Remove unittest/gunit.h and replace it, wherever it is used, with unittest/include_gunit.h.

Update: Created PR #1116 for the above.

  2. lstmeval - refers to a local path in Ray's user area - change it to a generic path.

 relink_command="(cd /usr/local/google/home/rays/opensrc/git/tesseract/training; { test -z \"\${LIBRARY_PATH+set}\" || unset LIBRARY_PATH || { LIBRARY_PATH=; export LIBRARY_PATH; }; }; { test -z \"\${COMPILER_PATH+set}\" || unset COMPILER_PATH || { COMPILER_PATH=; export COMPILER_PATH; }; }; { test -z \"\${GCC_EXEC_PREFIX+set}\" || unset GCC_EXEC_PREFIX || { GCC_EXEC_PREFIX=; export GCC_EXEC_PREFIX; }; }; { test -z \"\${LD_RUN_PATH+set}\" || unset LD_RUN_PATH || { LD_RUN_PATH=; export LD_RUN_PATH; }; }; { test -z \"\${LD_LIBRARY_PATH+set}\" || unset LD_LIBRARY_PATH || { LD_LIBRARY_PATH=; export LD_LIBRARY_PATH; }; }; PATH=/usr/local/google/home/rays/bin:/usr/lib/google-golang/bin:/usr/local/buildtools/java/jdk/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/google/home/rays/bin; export PATH; g++ -g -O2 -std=c++11 -o \$progdir/\$file combine_lang_model.o  ./.libs/libtesseract_training.a ./.libs/libtesseract_tessopt.a -licui18n -licuuc -licudata ../api/.libs/libtesseract.so -lpthread -fopenmp -Wl,-rpath -Wl,/usr/local/google/home/rays/opensrc/git/tesseract/api/.libs)"
 

@Shreeshrii
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following three shell scripts are marked as temporary ...

combine_lang_model - temporary wrapper script for .libs/combine_lang_model

lstmeval - temporary wrapper script for .libs/lstmeval

lstmtraining - temporary wrapper script for .libs/lstmtraining

Please sign in to comment.