In [1]:
using Flux
using Flux: onehot, onehotbatch, crossentropy, reset!, throttle
using Statistics: mean
using Random
using Unicode

In [2]:
# Map each language (a Symbol taken from the corpus file name) to the list of
# normalised sentences read from `corpus/<lang>.txt`.
corpora = Dict{Symbol,Vector{String}}()

cd(@__DIR__)
for file in readdir("corpus")
    # Only `<lang>.txt` files are corpora. Skip anything else (hidden files,
    # editor backups, ...) instead of failing on a `nothing` match; the pattern is
    # anchored so that e.g. `en.txt.bak` is not mistaken for the `en` corpus.
    m = match(r"^(.*)\.txt$", file)
    m === nothing && continue
    lang = Symbol(m.captures[1])

    # Treat every full stop as a sentence boundary.
    sentences = split(read(joinpath("corpus", file), String), ".")
    # Case-fold and strip diacritics so that text maps onto the small ASCII
    # alphabet below, then trim surrounding whitespace.
    sentences = strip.(Unicode.normalize.(sentences, casefold=true, stripmark=true))
    corpora[lang] = String.(filter(!isempty, sentences))
end

langs = collect(keys(corpora))
alphabet = ['a':'z'; '0':'9'; ' '; '\n'; '_']

# See which chars will be represented as "unknown"
unique(filter(x -> x ∉ alphabet, join(vcat(values(corpora)...))))

150-element Array{Char,1}:
 '(' 
 '/' 
 'ˌ' 
 'ɪ' 
 'ˈ' 
 'ː' 
 'ə' 
 ')' 
 ',' 
 '-' 
 '[' 
 ']' 
 '\''
 ⋮   
 'ব' 
 'ল' 
 'দ' 
 'শ' 
 'চ' 
 'ট' 
 'ম' 
 'ঢ' 
 'ক' 
 'খ' 
 'হ' 
 'স' 

In [3]:
# Every sentence becomes an (input, target) pair: the sentence one-hot encoded
# character by character over `alphabet` (characters outside it are encoded as
# '_'), and its language one-hot encoded over `langs`. The pairs are shuffled so
# that the languages are mixed.
dataset = shuffle([
    (onehotbatch(sentence, alphabet, '_'), onehot(lang, langs))
    for lang in langs for sentence in corpora[lang]
])

# Hold out the last 100 shuffled examples for evaluation.
train, test = dataset[1:(end - 100)], dataset[(end - 99):end]

(Tuple{Flux.OneHotMatrix{Array{Flux.OneHotVector,1}},Flux.OneHotVector}[([false false … false false; false false … false false; … ; false false … false false; false false … false true], [false, false, true, false, false]), ([false true … false false; false false … false false; … ; false false … false false; false false … false false], [false, false, false, true, false]), ([false false … false false; false false … false false; … ; false false … false false; true false … false false], [false, false, false, false, true]), ([false false … false true; false false … false false; … ; false false … false false; false false … false false], [false, false, false, true, false]), ([false false … false false; false false … false false; … ; false false … false false; true true … false false], [true, false, false, false, false]), ([false false … false false; false false … false false; … ; false false … false false; true false … false false], [false, false, false, true, false]), ([false false … false f

In [4]:
# Size of the per-character feature vector and of the LSTM layer.
N = 15

# `scanner` consumes one one-hot character at a time: a sigmoid Dense layer
# followed by an LSTM.
scanner = Chain(
    Dense(length(alphabet), N, σ),
    LSTM(N, N),
)

# `encoder` maps the scanner's output to one score per language.
encoder = Dense(N, length(langs))

# Predict the language of one sentence.
#
# `x` is a one-hot encoded sentence as produced by `onehotbatch`; `x.data` holds
# its one-hot characters in order. Returns the softmax over `langs` computed from
# the scanner's output for the final character.
function model(x)
    # Feed the characters through the stateful scanner strictly in order.
    # An explicit comprehension is used instead of broadcasting (`scanner.(...)`),
    # because broadcast does not promise an evaluation order and the recurrent
    # state depends on it.
    states = [scanner(ch) for ch in x.data]
    # Clear the recurrent state so the next sentence starts from scratch.
    reset!(scanner)
    return softmax(encoder(states[end]))
end

# Cross-entropy between the model's prediction for sentence `x` and the one-hot
# language target `y`.
function loss(x, y)
    return crossentropy(model(x), y)
end

# Mean loss over the held-out `test` examples.
function testloss()
    return mean(loss(x, y) for (x, y) in test)
end

# ADAM optimiser with its default settings.
opt = ADAM()
# Trainable parameters of both the scanner and the encoder.
ps = Flux.params(scanner, encoder)
# Callback that prints the current loss on the held-out examples.
evalcb = () -> (@show testloss())

#11 (generic function with 1 method)

In [6]:
# A single pass over `train`; the callback is throttled with a timeout of 10 so
# that the test loss is not recomputed after every example.
Flux.train!(loss, ps, train, opt; cb = throttle(evalcb, 10))

testloss() = 1.5330142f0 (tracked)
testloss() = 1.4882953f0 (tracked)
testloss() = 1.2766123f0 (tracked)
testloss() = 1.3358175f0 (tracked)
testloss() = 1.2766131f0 (tracked)
testloss() = 1.2379081f0 (tracked)
testloss() = 1.2043288f0 (tracked)
testloss() = 1.3020862f0 (tracked)
testloss() = 1.3938646f0 (tracked)
testloss() = 1.3144602f0 (tracked)
testloss() = 1.2121319f0 (tracked)
testloss() = 1.1983653f0 (tracked)
testloss() = 1.3928201f0 (tracked)
testloss() = 1.1769012f0 (tracked)
testloss() = 1.5113814f0 (tracked)
testloss() = 1.1120145f0 (tracked)
testloss() = 1.2551943f0 (tracked)
testloss() = 1.2663445f0 (tracked)
testloss() = 1.0412004f0 (tracked)
testloss() = 1.2133237f0 (tracked)
testloss() = 1.0968978f0 (tracked)
testloss() = 1.1209707f0 (tracked)
testloss() = 1.1489831f0 (tracked)
