# Character based RNN language model
(c) Deniz Yuret, 2018. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness.

* Objectives: Learn to define and train a character-based language model and to generate text from it. Along the way: minibatch blocks of text, keep a persistent RNN state between updates, and train both a Shakespeare generator and a Julia programmer using the same type of model.
* Prerequisites: [RNN basics](06.rnn.ipynb), minibatch, param, param0, RNN, dropout, train!, Adam, nll

In [1]:
using Pkg
# Install each required package only when it cannot already be loaded.
# `Pkg.installed()` is deprecated and only lists direct dependencies of the active
# project; `Base.find_package` returns `nothing` when `p` is not loadable.
for p in ("Knet","ProgressMeter")
    Base.find_package(p) === nothing && Pkg.add(p)
end

## Define the model

In [2]:
using Knet: param, param0, RNN, dropout

In [3]:
"""
    Embed(vocab::Int, embed::Int)

Embedding layer holding a single weight array `w`, created as `param(embed, vocab)`.
Calling the layer on `x` returns `w[:, x]`, i.e. the columns of `w` selected by `x`.
"""
struct Embed
    w
end

function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

# Column lookup: every entry of `x` picks one column of the weight array.
function (layer::Embed)(x)
    return layer.w[:, x]
end

In [4]:
"""
    Linear(input::Int, output::Int)

Affine layer with weights `w`, created as `param(output, input)`, and bias `b`, created as
`param0(output)`. Calling the layer on `x` returns `w * x .+ b`.
"""
struct Linear
    w
    b
end

function Linear(input::Int, output::Int)
    weights = param(output, input)
    bias = param0(output)
    return Linear(weights, bias)
end

# Matrix product followed by a broadcast bias add.
function (layer::Linear)(x)
    return layer.w * x .+ layer.b
end

In [5]:
"""
    CharLM(vocab::Int, input::Int, hidden::Int; o...)

Character language model made of three callables: an `Embed` input layer, an `RNN`, and a
`Linear` output layer. Extra keyword arguments `o...` are forwarded to `RNN`.

Calling the model as `c(x; pdrop=0, hidden=nothing)` embeds `x`, runs the RNN with the
given `hidden` argument, flattens all trailing dimensions into one, and applies the output
layer. `dropout(_, pdrop)` is applied after the embedding and again after the RNN.
"""
struct CharLM
    input
    rnn
    output
end

function CharLM(vocab::Int, input::Int, hidden::Int; o...)
    embedding = Embed(vocab, input)
    recurrent = RNN(input, hidden; o...)
    projection = Linear(hidden, vocab)
    return CharLM(embedding, recurrent, projection)
end

function (c::CharLM)(x; pdrop=0, hidden=nothing)
    embedded = dropout(c.input(x), pdrop)                    # (B,T)->(X,B,T)
    states = dropout(c.rnn(embedded, hidden=hidden), pdrop)  # (H,B,T)
    flat = reshape(states, size(states, 1), :)               # (H,B*T)
    return c.output(flat)                                    # (V,B*T)
end

In [6]:
# To generate text from trained models
"""
    generate(model, chars, n)

Sample `n` characters from `model`, one at a time, and print them to `stdout` followed by
a newline. Returns `nothing`.

# Arguments
- `model`: callable as `model([x], hidden=h)`; its output is treated as unnormalized
  log-probabilities over character ids.
- `chars`: collection mapping a character id to the value that is printed.
- `n`: number of characters to generate.

Generation starts from character id 1, and the same `h` array is passed as `hidden` on
every call (presumably so the RNN carries its state between steps; confirm against the
Knet `RNN` docs).
"""
function generate(model,chars,n)
    # Draw an index with probability proportional to exp(y[j]).
    function sample(y)
        scores = Array(y)
        # Shifting by the maximum leaves the distribution unchanged but keeps exp from
        # overflowing to Inf, which made every comparison below false.
        p = exp.(scores .- maximum(scores))
        r = rand() * sum(p)
        fallback = firstindex(p)
        for j in eachindex(p)
            p[j] > 0 || continue          # zero-probability entries can never be drawn
            (r -= p[j]) < 0 && return j
            fallback = j
        end
        # Rounding can leave r >= 0 after the scan; return the last index with positive
        # probability instead of falling off the loop and returning `nothing`.
        return fallback
    end
    x = 1
    h = []
    for _ in 1:n
        y = model([x], hidden=h)
        x = sample(y)
        print(chars[x])
    end
    println()
end;

In [7]:
# For running experiments
using Knet: train!, Adam; import ProgressMeter
"""
    trainresults(file, model, chars)

Ask on the console whether to train from scratch, then return `(model, chars)`.

If the reply starts with `y`, train `model` on the global `dtrn` with Adam and save it,
together with `chars`, to `file`. For any other reply, including an empty one, load
`"model"` and `"chars"` from `file`, downloading the file first when it does not exist
locally.

Reads the globals `EPOCHS`, `dtrn`, `LR`, `BETA_1`, `BETA_2`, `EPS` and `DROPOUT`, and
needs the name `Knet` itself to be in scope when called.
"""
function trainresults(file,model,chars)
    print("Train from scratch? ")
    # startswith is simply false for an empty reply (plain Enter or end of input),
    # whereas indexing readline()[1] throws a BoundsError.
    if startswith(readline(), "y")
        updates = 0; prog = ProgressMeter.Progress(EPOCHS * length(dtrn))
        # Advance the progress bar on every call; returns false once prog.n updates are done.
        callback(J)=(ProgressMeter.update!(prog, updates); (updates += 1) <= prog.n)
        opt = Adam(lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
        train!(model, dtrn; callback=callback, optimizer=opt, pdrop=DROPOUT, hidden=[])
        Knet.gc(); Knet.save(file,"model",model,"chars",chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model,chars = Knet.load(file,"model","chars")
    end
    return model,chars
end

trainresults (generic function with 1 method)

## The Complete Works of William Shakespeare

In [8]:
# Model
RNNTYPE    = :lstm   # passed to CharLM as rnnType
INPUTSIZE  = 168     # second CharLM argument (embedding / RNN input size)
VOCABSIZE  = 84      # first CharLM argument
HIDDENSIZE = 334     # third CharLM argument (RNN hidden size)
NUMLAYERS  = 1       # passed to CharLM as numLayers
DROPOUT    = 0.0     # passed to CharLM as dropout and to train! as pdrop

# Data
BATCHSIZE  = 256     # rows per minibatch in mb
SEQLENGTH  = 100     # columns per minibatch in mb

# Optimizer (Adam) and training length
LR         = 0.001
BETA_1     = 0.9
BETA_2     = 0.999
EPS        = 1e-08
EPOCHS     = 30      # scales the number of updates tracked by the progress meter

ENV["COLUMNS"] = 92;

In [9]:
# Load 'The Complete Works of William Shakespeare'
using Knet: Knet
# gutenberg.jl ships with Knet and defines shakespeare()
include(Knet.dir("data", "gutenberg.jl"))
trn, tst, chars = shakespeare()
summary.((trn, tst, chars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [10]:
# Decode a fixed slice of the training ids back to characters and print it
println(join(chars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [11]:
# Minibatch data
using Knet: minibatch
"""
    mb(a)

Cut the sequence `a` into `BATCHSIZE` contiguous rows and return
`minibatch(inputs, targets, SEQLENGTH)`, where `targets` is `inputs` shifted one position
to the right. Trailing elements that do not fill a whole row are dropped.
"""
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # (ncols,B) -> (B,ncols): each row is one contiguous stretch of the data
    rows = permutedims(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(192, 20)

In [12]:
# x and y of the first batch; each has dimensions (BATCHSIZE,SEQLENGTH)
map(summary, first(dtrn))

("256×100 Array{UInt8,2}", "256×100 Array{UInt8,2}")

In [13]:
using Knet: AutoGrad
# Train a fresh CharLM or load the saved one, depending on the console reply
shakemodel, shakechars = trainresults(
    "shakespeare.jld2",
    CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE;
           rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT),
    chars,
);

Train from scratch? stdin> n


In [14]:
using Knet: nll
# Test-set perplexity: exp of the value nll returns for the model on dtst
# (presumably the mean per-character negative log-likelihood; confirm against the Knet docs)
exp(nll(shakemodel,dtst))  # Perplexity

4.175054f0

In [15]:
# Sample and print 1000 characters from the Shakespeare model
generate(shakemodel,shakechars,1000)

ment.
ANTIPHAOLUS OF EPHESUS. You know, my lord lords, let the wreshsome birth
  Affligers formering youth be with thy idle
  There is fear'd, all with the force of quarrel ran there.
  Within icellance to you for she could Dircess,
  'If thou drrew'd or nothing the England's rock.
AUGE. Mean! "Where renowned Got speaking part,
     This day of country'st wrantless fellowed carliquis, I
     to well gold on.
  Os, Falstaff England. Bed with onvilers to the knavies
  LUCENTIO. As if thou his breast in't, and 'Jadam,
    Dreadful true, lady; when I apel fair,
    For those help crutly Troyan petilone!
    Some wenches, you may be cock, brother, we know,
    And so indeed no less on the day does
    That needs this- it phyied on walks to thee.  
    But know you undervy, suid there be loose;
    What sleeps I frightly command this fault,
    It were gentlemen.
  POSTARD. Good godem.
  CLOWN. Neither knowing the heads
    Trathed little recondstitancerous print.
    H


## Julia programmer

In [16]:
# Model
RNNTYPE    = :lstm   # passed to CharLM as rnnType
INPUTSIZE  = 512     # second CharLM argument (embedding / RNN input size)
VOCABSIZE  = 128     # first CharLM argument; also the cap applied to character ids
HIDDENSIZE = 512     # third CharLM argument (RNN hidden size)
NUMLAYERS  = 2       # passed to CharLM as numLayers
DROPOUT    = 0.0     # passed to CharLM as dropout and to train! as pdrop

# Data
BATCHSIZE  = 64      # rows per minibatch in mb
SEQLENGTH  = 64      # columns per minibatch in mb

# Optimizer (Adam) and training length
LR         = 0.001
BETA_1     = 0.9
BETA_2     = 0.999
EPS        = 1e-08
EPOCHS     = 10      # scales the number of updates tracked by the progress meter

ENV["COLUMNS"] = 92;

In [17]:
# Read julia base library source code
base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia")
# Concatenate every `.jl` file under `base`, in walkdir order, into one string.
# Writing into the buffer that `sprint` supplies avoids re-copying the accumulated text
# for each file, and no global is reassigned inside the loop.
text = sprint() do io
    for (root, dirs, files) in walkdir(base), f in files
        # endswith also handles names shorter than three characters and names ending
        # in multi-byte characters, for which f[end-2:end] throws.
        endswith(f, ".jl") && write(io, read(joinpath(root, f), String))
    end
end
length(text)

9131265

In [18]:
# Find unique chars, sort by frequency, assign integer ids.
# Occurrence count of every character in text
charcnt = Dict{Char,Int}()
for c in text
    charcnt[c] = get(charcnt, c, 0) + 1
end
# Characters ordered from most to least frequent; a character's position is its id
chars = sort(collect(keys(charcnt)); by = c -> charcnt[c], rev = true)
charid = Dict{Char,Int}(c => i for (i, c) in enumerate(chars))
# Two-column table: character, count
hcat(chars, [charcnt[c] for c in chars])

3642×2 Array{Any,2}:
 ' '   1971836
 'e'    548012
 't'    477724
 'n'    343215
 'r'    338122
 'i'    329419
 's'    325865
 'a'    316561
 'o'    275999
 '\n'   265652
 'l'    203478
 ','    200306
 ')'    194094
 ⋮            
 'ה'         1
 '🍢'         1
 '𝗾'         1
 '𝔔'         1
 'É'         1
 '𝓟'         1
 '𝚿'         1
 '𝕨'         1
 'ɛ'         1
 'Χ'         1
 '🕙'         1
 'ℚ'         1

In [19]:
# Keep only VOCABSIZE most frequent chars, split into train and test
data = [charid[c] for c in text]
# Ids above VOCABSIZE are all replaced by VOCABSIZE, so they share that id
map!(id -> min(id, VOCABSIZE), data, data)
ntst = 2^19
# The first ntst ids form the test set, the remainder the training set
tst = data[1:ntst]
trn = data[ntst+1:end]
map(length, (data, trn, tst))

(9131265, 8606977, 524288)

In [20]:
# Decode and print 1001 consecutive training ids starting at a random offset
r = rand(1:(length(trn)-1000))
println(join(chars[trn[r:r+1000]]))

u
        end
    end

    f = Symbol(umf_nm("free_numeric", Tv, Ti))
    @eval begin
        function ($f)(num::Ptr{Cvoid})
            tmp = [num]
            ccall(($(string(f)), :libumfpack), Cvoid, (Ptr{Cvoid},), tmp)
        end
        function umfpack_free_numeric(lu::UmfpackLU{$Tv,$Ti})
            if lu.numeric == C_NULL return lu end
            ($f)(lu.numeric)
            lu.numeric = C_NULL
            return lu
        end
    end
end

function umfpack_report_symbolic(symb::Ptr{Cvoid}, level::Real)
    old_prl::Float64 = umf_ctrl[UMFPACK_PRL]
    umf_ctrl[UMFPACK_PRL] = Float64(level)
    @isok ccall((:umfpack_dl_report_symbolic, :libumfpack), Int,
                (Ptr{Cvoid}, Ptr{Float64}), symb, umf_ctrl)
    umf_ctrl[UMFPACK_PRL] = old_prl
end

umfpack_report_symbolic(symb::Ptr{Cvoid}) = umfpack_report_symbolic(symb, 4.)

function umfpack_report_symbolic(lu::UmfpackLU, level::Real)
    umfpack_report_symbolic(umfpack_symbolic!(lu).symbolic, level)
end

umfpack_report_

In [21]:
# Minibatch data
using Knet: minibatch
"""
    mb(a)

Cut the sequence `a` into `BATCHSIZE` contiguous rows and return
`minibatch(inputs, targets, SEQLENGTH)`, where `targets` is `inputs` shifted one position
to the right. Trailing elements that do not fill a whole row are dropped.
"""
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # (ncols,B) -> (B,ncols): each row is one contiguous stretch of the data
    rows = permutedims(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(2101, 127)

In [22]:
# x and y of the first batch; each has dimensions (BATCHSIZE,SEQLENGTH)
map(summary, first(dtrn))

("64×64 Array{Int64,2}", "64×64 Array{Int64,2}")

In [23]:
using Knet: AutoGrad
# Train a fresh CharLM or load the saved one, depending on the console reply
juliamodel, juliachars = trainresults(
    "juliacharlm.jld2",
    CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE;
           rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT),
    chars,
);

Train from scratch? stdin> n


In [24]:
using Knet: nll
# Test-set perplexity: exp of the value nll returns for the model on dtst
# (presumably the mean per-character negative log-likelihood; confirm against the Knet docs)
exp(nll(juliamodel,dtst))  # Perplexity

4.802427f0

In [25]:
# Sample and print 1000 characters from the Julia source model
generate(juliamodel,juliachars,1000)

 # tolim just precise-charwabbrr characters)

    """
    scale_cmp([summary.safep, select...], inf_end)
    ((i < i + 2) && isnan(y) ? Complex{T}(x)) : min((x, y))
    cf <= sizeof(Int(l)) # 0 bits
end

@testset) TerminalMenu expression 1.
# This file is a part of Julia. License is MIT: https://julialang.org/license

export Uff_method
using .Future
@test diy_fp.struct === Union{Float64, Float64}

# test rounding methods to finition table.
import Libdl
import Libdl
DLt

@test rr.dest[1][1] == DocULeverError(docstrtraigbes2docatests)
test_15995(@repurse_mt),
    # needed for mibccs and/using a debug
finalize(-) # wait to the sdiagonal in bitarization store., and is determined safe
# * Max{N} ... capturely depends and all the documentation of the above in the waiting where f(id) whll binary
complexity remaining of type `T` have checkset cases.
if implicit_read(IOBuffer(), CustomTestSet, DataType, Rebuffer)
    # N15, the } handlee on the cache DNS tha ARC libraries
    @test remotecall_f