In [1]:
require 'torch'
require 'nn'
require 'optim'

In [2]:
-- TODO: load actual word vectors
-- first dimension: 150 examples
-- second dimension: 200 words OR 8 labels
-- third dimension: word vectors of dimensionality 300
train_inputs = torch.randn(150, 200, 300)
train_targets = torch.randn(150, 8)

## Data
Input format is a matrix $M \in \mathbb{R}^{n} \times k$
with:
* $n$ number of words in the document
* $k$ the dimension of the word vectors obtained from word2vec
* the data is zero-padded to the length of the longest document
(alternatively to a given maximum length). All not-taken word spaces
are set to zero in the matrix.

## Hyperparameters
* Filter windows of height $h$ are applied in sizes of $3,4,5$ with TODO feature
maps. Specifically the filters are applied as a matrix $W \in \mathbb{R}^{h \times k}$
to a window of size $h \times k$.
* The stride of the filters is $1$ to iterate over all possible windows of words
(*narrow convolution*).
* A dropout of rate $\rho = .5$ is applied at training time to prevent overfitting.
* An $l_2$ constraint $s = 3$ is applied in (Kim, 2014). However (Zhang and Wallace, 2015)
found that this constraint had little effect on the performance of the network.

In [3]:
-- metadata
batch_size = 100 -- ?
no_examples = train_inputs:size()[1]
no_classes = train_targets:size()[2]
sent_len = 150 -- TODO
wordvec_len = 300
dropout_rho = .5
l2_constr = 3

In [4]:
-- we have to copy the matrix 6 times to use the parallel container
-- also we squash the words to a given length, if not already done so.
-- zero padding is applied as well.
X = torch.Tensor(6, no_examples, sent_len, wordvec_len):zero()
y = train_targets

-- FIXME: this is way too complex and memory-inefficient. should use storage
-- and stuff in the future.

for i = 1, no_examples do
    for j = 1, sent_len do
        X[{1, i, j}] = train_inputs[i][j]
        X[{2, i, j}] = train_inputs[i][j]
        X[{3, i, j}] = train_inputs[i][j]
        X[{4, i, j}] = train_inputs[i][j]
        X[{5, i, j}] = train_inputs[i][j]
        X[{6, i, j}] = train_inputs[i][j]
    end
end

## Model

In [17]:
model = nn.Sequential()

model:add(nn.Padding())

-- stage 1: convolutions

-- in: applies the following declared layers to the first dimension of the
-- tensor (see above.)
--
-- out: concatenating the three concatenated max-pooled values to a vector
-- fed to the fully connected softmax layer yielding the outputs
p = nn.Parallel(1,1)

-- this is a convolution unit differing in the height of the filter being
-- applied. each filter is used double time to further improve performance.
-- each filter yields a feature map, thus for each region size we then
-- have two feature maps, and in total then six (if using default n-grams
-- 3 to 5 like (Kim, 2014))
for i = 3, 5 do
    -- for the two convolutions we use for each region size
    local f = nn.Parallel(1,1)

    -- elements of the convolution
    local s1 = nn.Sequential()
    local s2 = nn.Sequential()

    -- takes size of input plane (we only have one channel though)
    -- as well as output plane (again, using only one channel)
    -- and also the kernel width and height. in our case the width is fixed
    -- to a row in the input matrix for the document. the height however
    -- varies and ranges from 3..5
    s1:add(nn.SpatialConvolution(1, 1, sent_len, i))
    s2:add(nn.SpatialConvolution(1, 1, sent_len, i))

    -- non-linearities
    s1:add(nn.ReLU())
    s2:add(nn.ReLU())

    -- the viewed region of the matrix for max-pooling shall be the
    -- size of the matrix, as we want all values to be considered at
    -- once for a single maximum for each filter map.
    s1:add(nn.SpatialMaxPooling(sent_len, i))
    s2:add(nn.SpatialMaxPooling(sent_len, i))

    f:add(s1)
    f:add(s2)

    -- concatenating the two max-pooled values to a tensor of dim TODO
    f:add(nn.Concat(1))

    p:add(f)
end

model:add(p)

-- stage 2: fully connected softmax layer
model:add(nn.Normalize(2, l2_constr))
model:add(nn.Dropout(dropout_rho))
-- model.add(nn.Linear(6, no_classes))

-- model:add(nn.LogSoftMax()) -- for ClassNLLCriterion
model:add(nn.SoftMax())

## Training
The network is trained with Stochastic Gradient Descent (SGD)
with randomly shuffled mini-batches and the Adadelta update rule (Zeiler, 2012).

In [13]:
next_batch = function(offset)
    end_offset = offset + batch_size
    
    if end_offset > X:size()[2] then
        end_offset = offset + (X:size()[2] - offset)
    end

    return
        X[{{}, {offset, end_offset}, {}, {}}]
        , y[{{offset, end_offset}, {}}]
end

In [18]:
for i = 1, no_examples, batch_size do
    inputs, outputs = next_batch(i)
    
    local feval = function(x)
        parameters:copy(x)
        grad_parameters:zero()
        
        local f = 0
        for i = 1, #inputs do
            local output = model:forward(inputs[i])
            local err = criterion:forward(output, targets[i])
            f = f + err
            local df_do = criterion:backward(output, targets[i])
            model:backwards(inputs[i], df_do)
        end
        
        grad_parameters:div(#inputs)
        f = f / #inputs
        return f, grad_parameters
    end
    
    parameters, grad_parameters = model:getParameters()

    optim:adadelta(feval, parameters) --, optim_state)
end

[string "for i = 1, no_examples, batch_size do..."]:2: attempt to call global 'next_batch' (a nil value)
stack traceback:
	[string "for i = 1, no_examples, batch_size do..."]:2: in main chunk
	[C]: in function 'xpcall'
	/Users/nexus/torch/install/share/lua/5.1/itorch/main.lua:179: in function </Users/nexus/torch/install/share/lua/5.1/itorch/main.lua:143>
	/Users/nexus/torch/install/share/lua/5.1/lzmq/poller.lua:75: in function 'poll'
	/Users/nexus/torch/install/share/lua/5.1/lzmq/impl/loop.lua:307: in function 'poll'
	/Users/nexus/torch/install/share/lua/5.1/lzmq/impl/loop.lua:325: in function 'sleep_ex'
	/Users/nexus/torch/install/share/lua/5.1/lzmq/impl/loop.lua:370: in function 'start'
	/Users/nexus/torch/install/share/lua/5.1/itorch/main.lua:350: in main chunk
	[C]: in function 'require'
	[string "arg={'/Users/nexus/.ipython/profile_default/s..."]:1: in main chunk: 

## Sources
* **(Kim, 2014):** Convolutional Neural Networks for Sentence Classification by Yoon Kim
* **(Zhang and Wallace, 2015):** A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification by Ye Zhang, Byron Wallace