RNN + LSTM Layers #3948

Merged · 3 commits · Jun 2, 2016
include/caffe/layers/lstm_layer.hpp
@@ -0,0 +1,154 @@
+#ifndef CAFFE_LSTM_LAYER_HPP_
+#define CAFFE_LSTM_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/recurrent_layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
+ * [1] style recurrent neural network (RNN). Implemented by unrolling
+ * the LSTM computation through time.
+ *
+ * The specific architecture used in this implementation is as described in
+ * "Learning to Execute" [2], reproduced below:
+ * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
+ * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
+ * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
+ * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
+ * c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
+ * h_t := o_t .* \tanh[c_t]
+ * In the implementation, the i, f, o, and g computations are performed as a
+ * single inner product.
+ *
+ * Notably, this implementation lacks the "diagonal" gates, as used in the
+ * LSTM architectures described by Alex Graves [3] and others.
+ *
+ * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
+ * Neural Computation 9, no. 8 (1997): 1735-1780.
+ *
+ * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
+ * arXiv preprint arXiv:1410.4615 (2014).
+ *
+ * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
+ * arXiv preprint arXiv:1308.0850 (2013).
+ */
+template <typename Dtype>
+class LSTMLayer : public RecurrentLayer<Dtype> {
+ public:
+ explicit LSTMLayer(const LayerParameter& param)
+ : RecurrentLayer<Dtype>(param) {}
+
+ virtual inline const char* type() const { return "LSTM"; }
+
+ protected:
+ virtual void FillUnrolledNet(NetParameter* net_param) const;
+ virtual void RecurrentInputBlobNames(vector<string>* names) const;
+ virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+ virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
+ virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+/**
+ * @brief A helper for LSTMLayer: computes a single timestep of the
+ * non-linearity of the LSTM, producing the updated cell and hidden
+ * states.
+ */
+template <typename Dtype>
+class LSTMUnitLayer : public Layer<Dtype> {
+ public:
+ explicit LSTMUnitLayer(const LayerParameter& param)
+ : Layer<Dtype>(param) {}
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+
+ virtual inline const char* type() const { return "LSTMUnit"; }
+ virtual inline int ExactNumBottomBlobs() const { return 3; }
+ virtual inline int ExactNumTopBlobs() const { return 2; }
+
+ virtual inline bool AllowForceBackward(const int bottom_index) const {
+ // Can't propagate to sequence continuation indicators.
+ return bottom_index != 2;
+ }
+
+ protected:
+ /**
+ * @param bottom input Blob vector (length 3)
+ * -# @f$ (1 \times N \times D) @f$
+ * the previous timestep cell state @f$ c_{t-1} @f$
+ * -# @f$ (1 \times N \times 4D) @f$
+ * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
+ * -# @f$ (1 \times N) @f$
+ * the sequence continuation indicators @f$ \delta_t @f$
+ * @param top output Blob vector (length 2)
+ * -# @f$ (1 \times N \times D) @f$
+ * the updated cell state @f$ c_t @f$, computed as:
+ * i_t := \sigmoid[i_t']
+ * f_t := \sigmoid[f_t']
+ * o_t := \sigmoid[o_t']
+ * g_t := \tanh[g_t']
+ * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
+ * -# @f$ (1 \times N \times D) @f$
+ * the updated hidden state @f$ h_t @f$, computed as:
+ * h_t := o_t .* \tanh[c_t]
+ */
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+
+ /**
+ * @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
+ *
+ * @param top output Blob vector (length 2), providing the error gradient with
+ * respect to the outputs
+ * -# @f$ (1 \times N \times D) @f$:
+ * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
+ * with respect to the updated cell state @f$ c_t @f$
+ * -# @f$ (1 \times N \times D) @f$:
+ * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
+ *      with respect to the updated hidden state @f$ h_t @f$
+ * @param propagate_down see Layer::Backward.
+ * @param bottom input Blob vector (length 3), into which the error gradients
+ * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
+ *        inputs are computed. Computation of the error gradients w.r.t.
+ * the sequence indicators is not implemented.
+ * -# @f$ (1 \times N \times D) @f$
+ * the error gradient w.r.t. the previous timestep cell state
+ * @f$ c_{t-1} @f$
+ * -# @f$ (1 \times N \times 4D) @f$
+ * the error gradient w.r.t. the "gate inputs"
+ * @f$ [
+ * \frac{\partial E}{\partial i_t}
+ * \frac{\partial E}{\partial f_t}
+ * \frac{\partial E}{\partial o_t}
+ * \frac{\partial E}{\partial g_t}
+ * ] @f$
+ *   -# @f$ (1 \times N) @f$
+ * the gradient w.r.t. the sequence continuation indicators
+ * @f$ \delta_t @f$ is currently not computed.
+ */
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ /// @brief The hidden and output dimension.
+ int hidden_dim_;
+ Blob<Dtype> X_acts_;
+};
+
+} // namespace caffe
+
+#endif // CAFFE_LSTM_LAYER_HPP_
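
As a reading aid, here is a minimal C++ sketch (not part of this PR) of the single-timestep LSTMUnit math documented above. The names `LSTMUnitStep`, `prev_c`, `gate_input`, and `cont` are hypothetical stand-ins for `bottom[0..2]`, and the output vectors are assumed to be pre-sized to `N*D`:

```cpp
#include <cmath>
#include <vector>

static inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One LSTMUnit timestep for N streams with hidden dimension D.
// gate_input packs [i', f', o', g'] per stream (size N*4D), matching the
// (1 x N x 4D) bottom[1] layout documented above.
void LSTMUnitStep(int N, int D,
                  const std::vector<float>& prev_c,      // (N*D)  c_{t-1}
                  const std::vector<float>& gate_input,  // (N*4D) [i',f',o',g']
                  const std::vector<float>& cont,        // (N)    delta_t
                  std::vector<float>* c,                 // (N*D)  c_t (output)
                  std::vector<float>* h) {               // (N*D)  h_t (output)
  for (int n = 0; n < N; ++n) {
    const float* x = &gate_input[n * 4 * D];
    for (int d = 0; d < D; ++d) {
      const float i = sigmoid(x[d]);
      const float f = sigmoid(x[D + d]);
      const float o = sigmoid(x[2 * D + d]);
      const float g = std::tanh(x[3 * D + d]);
      // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t), so a sequence
      // boundary (cont = 0) erases the carried cell state.
      const float c_t = cont[n] * (f * prev_c[n * D + d]) + i * g;
      (*c)[n * D + d] = c_t;
      (*h)[n * D + d] = o * std::tanh(c_t);  // h_t := o_t .* tanh(c_t)
    }
  }
}
```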
include/caffe/layers/recurrent_layer.hpp
@@ -0,0 +1,187 @@
+#ifndef CAFFE_RECURRENT_LAYER_HPP_
+#define CAFFE_RECURRENT_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/format.hpp"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief An abstract class for implementing recurrent behavior inside of an
+ * unrolled network. This Layer type cannot be instantiated -- instead,
+ * you should use one of its implementations which defines the recurrent
+ * architecture, such as RNNLayer or LSTMLayer.
+ */
+template <typename Dtype>
+class RecurrentLayer : public Layer<Dtype> {
+ public:
+ explicit RecurrentLayer(const LayerParameter& param)
+ : Layer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reset();
+
+ virtual inline const char* type() const { return "Recurrent"; }
+ virtual inline int MinBottomBlobs() const {
+ int min_bottoms = 2;
+ if (this->layer_param_.recurrent_param().expose_hidden()) {
+ vector<string> inputs;
+ this->RecurrentInputBlobNames(&inputs);
+ min_bottoms += inputs.size();
+ }
+ return min_bottoms;
+ }
+ virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; }
+ virtual inline int ExactNumTopBlobs() const {
+ int num_tops = 1;
+ if (this->layer_param_.recurrent_param().expose_hidden()) {
+ vector<string> outputs;
+ this->RecurrentOutputBlobNames(&outputs);
+ num_tops += outputs.size();
+ }
+ return num_tops;
+ }
+
+ virtual inline bool AllowForceBackward(const int bottom_index) const {
+ // Can't propagate to sequence continuation indicators.
+ return bottom_index != 1;
+ }
+
+ protected:
+ /**
+ * @brief Fills net_param with the recurrent network architecture. Subclasses
+ * should define this -- see RNNLayer and LSTMLayer for examples.
+ */
+ virtual void FillUnrolledNet(NetParameter* net_param) const = 0;
+
+ /**
+ * @brief Fills names with the names of the 0th timestep recurrent input
+ * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer
+ * for examples.
+ */
+ virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;
+
+ /**
+ * @brief Fills shapes with the shapes of the recurrent input Blob&s.
+ * Subclasses should define this -- see RNNLayer and LSTMLayer
+ * for examples.
+ */
+ virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0;
+
+ /**
+ * @brief Fills names with the names of the Tth timestep recurrent output
+ * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer
+ * for examples.
+ */
+ virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;
+
+ /**
+ * @brief Fills names with the names of the output blobs, concatenated across
+ * all timesteps. Should return a name for each top Blob.
+ * Subclasses should define this -- see RNNLayer and LSTMLayer for
+ * examples.
+ */
+ virtual void OutputBlobNames(vector<string>* names) const = 0;
+
+ /**
+ * @param bottom input Blob vector (length 2-3)
+ *
+ * -# @f$ (T \times N \times ...) @f$
+ * the time-varying input @f$ x @f$. After the first two axes, whose
+ * dimensions must correspond to the number of timesteps @f$ T @f$ and
+ * the number of independent streams @f$ N @f$, respectively, its
+ * dimensions may be arbitrary. Note that the ordering of dimensions --
+ * @f$ (T \times N \times ...) @f$, rather than
+ * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
+ * independent input streams must be "interleaved".
+ *
+ * -# @f$ (T \times N) @f$
+ * the sequence continuation indicators @f$ \delta @f$.
+ * These inputs should be binary (0 or 1) indicators, where
+ * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
+ * @f$ n @f$ is the beginning of a new sequence, and hence the previous
+ * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
+ * and has no effect on the cell's output at timestep @f$ t @f$, and
+ * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
+ * stream @f$ n @f$ is a continuation from the previous timestep
+ * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
+ * updated hidden state and output.
+ *
+ * -# @f$ (N \times ...) @f$ (optional)
+ * the static (non-time-varying) input @f$ x_{static} @f$.
+ * After the first axis, whose dimension must be the number of
+ * independent streams, its dimensions may be arbitrary.
+ * This is mathematically equivalent to using a time-varying input of
+ * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
+ * across the @f$ T @f$ timesteps and concatenating with the time-varying
+ * input. Note that if this input is used, all timesteps in a single
+ * batch within a particular one of the @f$ N @f$ streams must share the
+ * same static input, even if the sequence continuation indicators
+ *      suggest that different sequences are ending and beginning within a
+ * single batch. This may require padding and/or truncation for uniform
+ * length.
+ *
+ * @param top output Blob vector (length 1)
+ * -# @f$ (T \times N \times D) @f$
+ * the time-varying output @f$ y @f$, where @f$ D @f$ is
+ * <code>recurrent_param.num_output()</code>.
+ * Refer to documentation for particular RecurrentLayer implementations
+ * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
+ */
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ /// @brief A Net to implement the Recurrent functionality.
+ shared_ptr<Net<Dtype> > unrolled_net_;
+
+ /// @brief The number of independent streams to process simultaneously.
+ int N_;
+
+ /**
+ * @brief The number of timesteps in the layer's input, and the number of
+ * timesteps over which to backpropagate through time.
+ */
+ int T_;
+
+ /// @brief Whether the layer has a "static" input copied across all timesteps.
+ bool static_input_;
+
+ /**
+ * @brief The last layer to run in the network. (Any later layers are losses
+ * added to force the recurrent net to do backprop.)
+ */
+ int last_layer_index_;
+
+ /**
+ * @brief Whether the layer's hidden state at the first and last timesteps
+ * are layer inputs and outputs, respectively.
+ */
+ bool expose_hidden_;
+
+ vector<Blob<Dtype>* > recur_input_blobs_;
+ vector<Blob<Dtype>* > recur_output_blobs_;
+ vector<Blob<Dtype>* > output_blobs_;
+ Blob<Dtype>* x_input_blob_;
+ Blob<Dtype>* x_static_input_blob_;
+ Blob<Dtype>* cont_input_blob_;
+};
+
+} // namespace caffe
+
+#endif // CAFFE_RECURRENT_LAYER_HPP_
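
To make the "interleaved" (T x N x ...) layout and the continuation indicators concrete, here is a sketch of the unrolled loop over plain float buffers; `UnrolledForward` and the `Step` callback are illustrative only, not APIs introduced by this PR:

```cpp
#include <vector>

// Timestep-major (T x N x D) storage: all N streams for timestep t are
// contiguous, so element (t, n) starts at offset (t * N + n) * D.
// `step` is any per-timestep recurrence h_t = step(x_t, h_{t-1}); it is
// left abstract here -- the point is the indexing and the cont gating.
template <typename Step>
void UnrolledForward(int T, int N, int D,
                     const std::vector<float>& x,     // (T*N*D) inputs
                     const std::vector<float>& cont,  // (T*N)   indicators
                     std::vector<float>* h,           // (T*N*D) outputs
                     Step step) {
  std::vector<float> h_prev(N * D, 0.0f);
  for (int t = 0; t < T; ++t) {
    for (int n = 0; n < N; ++n) {
      // delta_{t,n} = 0 starts a new sequence: the previous hidden state
      // is zeroed and has no effect on this timestep's output.
      const float delta = cont[t * N + n];
      for (int d = 0; d < D; ++d) h_prev[n * D + d] *= delta;
      const float* x_tn = &x[(t * N + n) * D];
      float* h_tn = &(*h)[(t * N + n) * D];
      step(x_tn, &h_prev[n * D], h_tn, D);  // h_t = step(x_t, h_{t-1})
      for (int d = 0; d < D; ++d) h_prev[n * D + d] = h_tn[d];
    }
  }
}
```

The same offset arithmetic shows why the streams must be interleaved: slicing the input along the first axis yields the contiguous (1 x N x ...) blob that each unrolled timestep consumes.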
include/caffe/layers/rnn_layer.hpp
@@ -0,0 +1,47 @@
+#ifndef CAFFE_RNN_LAYER_HPP_
+#define CAFFE_RNN_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/recurrent_layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief Processes time-varying inputs using a simple recurrent neural network
+ *        (RNN). Implemented by unrolling the RNN computation through time.
+ *
+ * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$
+ * h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ]
+ * @f$, and outputs @f$
+ * o_t := \tanh[ W_{ho} h_t + b_o ]
+ * @f$.
+ */
+template <typename Dtype>
+class RNNLayer : public RecurrentLayer<Dtype> {
+ public:
+ explicit RNNLayer(const LayerParameter& param)
+ : RecurrentLayer<Dtype>(param) {}
+
+ virtual inline const char* type() const { return "RNN"; }
+
+ protected:
+ virtual void FillUnrolledNet(NetParameter* net_param) const;
+ virtual void RecurrentInputBlobNames(vector<string>* names) const;
+ virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+ virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
+ virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+} // namespace caffe
+
+#endif // CAFFE_RNN_LAYER_HPP_
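
Finally, a hedged sketch of the two RNNLayer equations for a single timestep. `RNNStep`, `AddMatVec`, and the weight names are illustrative stand-ins; in the layer itself these products are expressed as layers of the net built by `FillUnrolledNet`, and this sketch only restates the math:

```cpp
#include <cmath>
#include <vector>

// Illustrative helper: y += W * v, with W stored row-major as (rows x cols).
static void AddMatVec(const std::vector<float>& W, int rows, int cols,
                      const float* v, std::vector<float>* y) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      (*y)[r] += W[r * cols + c] * v[c];
}

// One timestep of the RNN documented above, for hidden dimension D and
// input dimension I:
//   h_t := tanh(W_hh * h_{t-1} + W_xh * x_t + b_h)
//   o_t := tanh(W_ho * h_t + b_o)
void RNNStep(int D, int I,
             const std::vector<float>& W_hh,  // (D x D)
             const std::vector<float>& W_xh,  // (D x I)
             const std::vector<float>& W_ho,  // (D x D)
             const std::vector<float>& b_h,   // (D)
             const std::vector<float>& b_o,   // (D)
             const float* x_t,                // (I)
             const float* h_prev,             // (D)
             std::vector<float>* h_t,         // (D) output
             std::vector<float>* o_t) {       // (D) output
  *h_t = b_h;
  AddMatVec(W_hh, D, D, h_prev, h_t);
  AddMatVec(W_xh, D, I, x_t, h_t);
  for (int d = 0; d < D; ++d) (*h_t)[d] = std::tanh((*h_t)[d]);
  *o_t = b_o;
  AddMatVec(W_ho, D, D, h_t->data(), o_t);
  for (int d = 0; d < D; ++d) (*o_t)[d] = std::tanh((*o_t)[d]);
}
```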