# Sequence classification model for IMDB Sentiment Analysis

* Objectives: Learn the structure of the IMDB dataset and train a simple RNN model.
* Prerequisites: RNN models (06.rnn.ipynb), param, GRU, nll, minibatch, accuracy, Adam, train!, Train
* Knet: dir (used by imdb.jl)

In [None]:
# Install Knet only when it cannot be located in the current load path.
# `Pkg.installed()` is deprecated (Julia >= 1.4), so the check uses
# `Base.find_package`, which returns `nothing` when the package is not found.
using Pkg; Base.find_package("Knet") !== nothing || Pkg.add("Knet")

In [None]:
# Training schedule
EPOCHS = 3            # number of passes over the training data
BATCHSIZE = 64        # instances per minibatch

# Model dimensions
EMBEDSIZE = 125       # size of each word embedding
NUMHIDDEN = 100       # size of the hidden layer
MAXLEN = 150          # word-sequence length: shorter sequences are padded, longer ones truncated
VOCABSIZE = 30000     # vocabulary cap: keep the 30K most frequent words, map the rest to the UNK token
NUMCLASS = 2          # number of output classes

# Regularization and optimizer settings
DROPOUT = 0.2         # dropout rate
LR = 0.001            # learning rate
BETA_1 = 0.9          # Adam optimization parameter
BETA_2 = 0.999        # Adam optimization parameter
EPS = 1e-8            # Adam optimization parameter

## Load and view data

In [None]:
# Bring only the module name `Knet` into scope; its members are accessed qualified below.
using Knet: Knet
ENV["COLUMNS"]=92                     # column width for array printing
# Knet.dir("data","imdb.jl") yields the path of the loader script (presumably inside
# the Knet package directory); including it defines `imdb`, used in the following cells.
include(Knet.dir("data","imdb.jl"))   # defines imdb loader

In [None]:
# Show the documentation attached to the `imdb` loader defined by imdb.jl.
@doc imdb

In [None]:
# Load the dataset and time the call. MAXLEN and VOCABSIZE are passed as the loader's
# `maxlen` and `maxval` keywords. The result is destructured into five values:
# presumably training inputs/labels, test inputs/labels, and the word dictionary.
# The trailing `;` suppresses notebook output of the returned tuple.
@time (xtrn,ytrn,xtst,ytst,imdbdict)=imdb(maxlen=MAXLEN,maxval=VOCABSIZE);

In [None]:
# Broadcast `summary` over the five loaded objects to get a brief description of each.
summary.((xtrn,ytrn,xtst,ytst,imdbdict))

In [None]:
# Words are encoded with integers.
# Pick one random element of xtrn; the adjoint (') makes it display as a row.
rand(xtrn)'

In [None]:
# Each word sequence is padded or truncated to length 150 (the MAXLEN setting).
# Broadcasting `length` over xtrn lists every sequence length; the adjoint (') shows it as a row.
length.(xtrn)'

In [None]:
# Build an index -> word table from the dictionary, then define a helper that
# prints an integer-encoded sequence as readable text.
imdbvocab = Vector{String}(undef, length(imdbdict))
for (word, index) in imdbdict
    imdbvocab[index] = word
end
# The last three vocabulary slots are overwritten with special-token names
# (presumably the ids the loader reserves for unknown, start and padding tokens).
imdbvocab[(VOCABSIZE - 2):VOCABSIZE] = ["<unk>", "<s>", "<pad>"]
# Look up each index of `x` in the table and print the words separated by spaces.
printwords(x) = println(join(imdbvocab[x], ' '))

In [None]:
# Hit shift-Enter to see random reviews:
# each execution draws a random element of xtrn and prints it as words.
printwords(rand(xtrn))

In [None]:
# Here are the labels: 1=negative, 2=positive
# (the adjoint (') makes the labels display as a row)
ytrn'

## Define the model

In [None]:
using Knet: param, dropout, RNN

In [None]:
# Immutable container for the three components of the classifier.
# Fields are intentionally left untyped.
struct SequenceClassifier
    input     # lookup table whose columns are selected by word index in the forward pass
    rnn       # recurrent layer applied to the embedded sequence
    output    # matrix multiplied with the final hidden slice to produce the scores
end

In [None]:
# Convenience constructor: build a classifier from four integer sizes.
#   input  - second dimension passed to `param` for the embedding table (vocabulary size)
#   embed  - first dimension of the embedding table and the RNN input size
#   hidden - RNN hidden size and second dimension of the output parameter
#   output - first dimension of the output parameter (number of classes)
function SequenceClassifier(input::Int, embed::Int, hidden::Int, output::Int)
    embedding = param(embed, input)
    recurrent = RNN(embed, hidden; rnnType=:gru)   # GRU variant of Knet's RNN
    projection = param(output, hidden)
    return SequenceClassifier(embedding, recurrent, projection)
end

In [None]:
# Forward pass: turn a batch of integer-encoded sequences into class scores.
#
# Arguments
#   input - collection of equal-length index vectors, one per instance
#   pdrop - dropout probability applied after the embedding and after the RNN
#           (default 0)
#
# Returns `sc.output` multiplied by the last slice of the RNN output along its
# third dimension.
function (sc::SequenceClassifier)(input; pdrop=0)
    # Stack the sequences as columns, then transpose so each row is one instance.
    # `reduce(hcat, ...)` replaces `hcat(input...)` so the whole batch is not
    # splatted into varargs.
    indices = permutedims(reduce(hcat, input))
    # Indexing columns with an index matrix yields a 3-D array of embeddings.
    embed = sc.input[:, indices]
    embed = dropout(embed, pdrop)
    hidden = sc.rnn(embed)
    hidden = dropout(hidden, pdrop)
    # Only the final slice along the third dimension feeds the output layer
    # (presumably the last time step -- confirm against Knet's RNN docs).
    return sc.output * hidden[:, :, end]
end

In [None]:
# Instantiate the classifier from the hyperparameters defined at the top of the notebook.
model = SequenceClassifier(VOCABSIZE,EMBEDSIZE,NUMHIDDEN,NUMCLASS)

In [None]:
using Knet: minibatch
# Wrap the training and test sets in minibatch iterators of BATCHSIZE instances;
# only the training iterator is created with shuffle=true.
dtrn = minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
dtst = minibatch(xtst,ytst,BATCHSIZE)
# Smoke test: take the first training batch and run the untrained model on its inputs.
(x,y) = first(dtrn)
model(x)

In [None]:
using Knet: nll, accuracy
# Baseline before training: loss (nll) and accuracy on the training and test iterators.
nll(model,dtrn), nll(model,dtst), accuracy(model,dtrn), accuracy(model,dtst)

In [None]:
using Knet: Adam, train!, Train
# Configure Adam with the hyperparameters defined at the top of the notebook.
opt = Adam(lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
# Length of the training iterator (presumably the number of minibatches per epoch).
ntrn = length(dtrn)
# Train and time the run. The callback range goes from 0 to EPOCHS*ntrn in steps of
# ntrn÷5 -- presumably five progress reports per epoch, stopping after EPOCHS epochs
# (TODO confirm against Knet's Train documentation). pdrop=DROPOUT is passed as an
# extra keyword, presumably forwarded to the model so dropout is active during training.
@time train!(model, dtrn; optimizer=opt, callback=Train(0:ntrn÷5:EPOCHS*ntrn), pdrop=DROPOUT)

In [None]:
# Reference result recorded from an earlier run (tuple order matches the expression below:
# train nll, test nll, train accuracy, test accuracy):
# 33s (0.059155148f0, 0.3877507f0, 0.9846153846153847, 0.8583733974358975)
nll(model,dtrn), nll(model,dtst), accuracy(model,dtrn), accuracy(model,dtst)

In [None]:
# Call Knet's gc at the end of the notebook (presumably to release memory held by
# Knet -- confirm in the Knet docs).
Knet.gc()