From a18a0072b847f201595afcd8ed11bf86919e64c5 Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Tue, 3 Aug 2021 10:26:36 +0200
Subject: [PATCH 1/6] changed fsm default printing

---
 src/fsm.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/fsm.jl b/src/fsm.jl
index e0ded33..30ba3fe 100644
--- a/src/fsm.jl
+++ b/src/fsm.jl
@@ -56,7 +56,13 @@ function link!(fsm::FSM{T}, src::State{T}, dest::State{T}, weight::T = one(T)) w
     link
 end
 
-function Base.show(io, ::MIME"image/svg+xml", fsm::FSM)
+function Base.show(io::IO, fsm::FSM)
+    nstates = length(fsm.states)
+    nlinks = sum(length, values(fsm.links))
+    print(io, "$(typeof(fsm)) # states: $nstates # links: $nlinks")
+end
+
+function Base.show(io::IO, ::MIME"image/svg+xml", fsm::FSM)
     dotpath, dotfile = mktemp()
     svgpath, svgfile = mktemp()
 

From dd959e07c7d176956cc4f153f79c849066db0e11 Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Tue, 3 Aug 2021 10:27:47 +0200
Subject: [PATCH 2/6] remove unnecessary promote rule

---
 src/semifields.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/semifields.jl b/src/semifields.jl
index 8c62d02..c5ea9f5 100644
--- a/src/semifields.jl
+++ b/src/semifields.jl
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
 
-# Stable implemenation of the log(exp(x) + exp(y)).
+# Stable implementation of the log(exp(x) + exp(y)).
 function logaddexp(x::T, y::T) where T
     diff = zero(T)
     if x < y
@@ -20,7 +20,7 @@ abstract type Semifield <: Number end
 
 Base.convert(T::Type{<:Number}, x::Semifield) = T(x.val)
 Base.promote(x::SF, y::Real) where SF <: Semifield = x, SF(y)
-Base.promote(x::Real, y::SF) where SF <: Semifield = SF(x), y
+#Base.promote(x::Real, y::SF) where SF <: Semifield = SF(x), y
 Base.show(io::IO, x::Semifield) = print(io, x.val)
 
 #======================================================================

From 9658600e3db555ef79b6bbe58e5d109f6bd0dfde Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Tue, 3 Aug 2021 10:49:43 +0200
Subject: [PATCH 3/6] renamed 'link' to 'arc'.

---
 src/MarkovModels.jl |  2 +-
 src/cfsm.jl         |  6 ++--
 src/fsm.jl          | 75 +++++++++++++++++++++++----------------------
 test/runtests.jl    |  6 ++--
 4 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/src/MarkovModels.jl b/src/MarkovModels.jl
index fdbc2b5..b1c9734 100644
--- a/src/MarkovModels.jl
+++ b/src/MarkovModels.jl
@@ -32,7 +32,7 @@ include("fsm.jl")
 export FSM
 export addstate!
 export determinize
-export link!
+export addarc!
 export minimize
 export renormalize!
 export setinit!
diff --git a/src/cfsm.jl b/src/cfsm.jl
index 0e5298e..fc1bc3d 100644
--- a/src/cfsm.jl
+++ b/src/cfsm.jl
@@ -43,9 +43,9 @@ function compile(fsm::FSM{T}; allocator = spzeros) where T
     A = allocator(T, S, S)
     Aᵀ = allocator(T, S, S)
     for src in allstates
-        for link in links(fsm, src)
-            A[src.id, link.dest.id] = link.weight
-            Aᵀ[link.dest.id, src.id] = link.weight
+        for arc in arcs(fsm, src)
+            A[src.id, arc.dest.id] = arc.weight
+            Aᵀ[arc.dest.id, src.id] = arc.weight
         end
     end
 
diff --git a/src/fsm.jl b/src/fsm.jl
index 30ba3fe..960dae7 100644
--- a/src/fsm.jl
+++ b/src/fsm.jl
@@ -18,7 +18,7 @@ isemitting(s::State)  = ! isnothing(s.pdfindex)
 setinit!(s::State{T}, weight::T = one(T)) where T = s.initweight = weight
 setfinal!(s::State{T}, weight::T = one(T)) where T = s.finalweight = weight
 
-mutable struct Link{T<:Semifield}
+mutable struct Arc{T<:Semifield}
     dest::State
     weight::T
 end
@@ -26,20 +26,21 @@ end
 """
     struct FSM{T<:Semifield}
         states # vector of states
-        links # Dict state -> vector of links
+        arcs # Dict state -> vector of arcs
     end
 
 Probabilistic finite state machine.
 """
 struct FSM{T<:Semifield}
     states::Vector{State{T}}
-    links::Dict{State, Vector{Link{T}}}
+    arcs::Dict{State, Vector{Arc{T}}}
 end
-FSM{T}() where T = FSM{T}(State{T}[], Dict{State, Vector{Link{T}}}())
+FSM{T}() where T = FSM{T}(State{T}[], Dict{State, Vector{Arc{T}}}())
 FSM() = FSM{LogSemifield{Float64}}()
 
 states(fsm::FSM) = fsm.states
-links(fsm::FSM{T}, state::State{T}) where T = get(fsm.links, state, Link{T}[])
+arcs(fsm::FSM{T}, state::State{T}) where T = get(fsm.arcs, state, Arc{T}[])
+@deprecate links(fsm, state) arcs(fsm, state)
 
 function addstate!(fsm::FSM{T}; initweight = zero(T), finalweight = zero(T),
                    pdfindex = nothing, label = nothing) where T
@@ -48,18 +49,20 @@ function addstate!(fsm::FSM{T}; initweight = zero(T), finalweight = zero(T),
     s
 end
 
-function link!(fsm::FSM{T}, src::State{T}, dest::State{T}, weight::T = one(T)) where T
-    list = get(fsm.links, src, Link{T}[])
-    link = Link{T}(dest, weight)
-    push!(list, link)
-    fsm.links[src] = list
-    link
+function addarc!(fsm::FSM{T}, src::State{T}, dest::State{T}, weight::T = one(T)) where T
+    list = get(fsm.arcs, src, Arc{T}[])
+    arc = Arc{T}(dest, weight)
+    push!(list, arc)
+    fsm.arcs[src] = list
+    arc
 end
+@deprecate link!(fsm, src, dest) addarc!(fsm, src, dest)
+@deprecate link!(fsm, src, dest, weight) addarc!(fsm, src, dest, weight)
 
 function Base.show(io::IO, fsm::FSM)
     nstates = length(fsm.states)
-    nlinks = sum(length, values(fsm.links))
-    print(io, "$(typeof(fsm)) # states: $nstates # links: $nlinks")
+    narcs = sum(length, values(fsm.arcs))
+    print(io, "$(typeof(fsm)) # states: $nstates # arcs: $narcs")
 end
 
 function Base.show(io::IO, ::MIME"image/svg+xml", fsm::FSM)
@@ -89,10 +92,10 @@ function Base.show(io::IO, ::MIME"image/svg+xml", fsm::FSM)
     end
 
     for src in states(fsm)
-        for link in links(fsm, src)
-            weight = round(convert(Float64, link.weight), digits = 3)
+        for arc in arcs(fsm, src)
+            weight = round(convert(Float64, arc.weight), digits = 3)
             srcname = "$(src.id)"
-            destname = "$(link.dest.id)"
+            destname = "$(arc.dest.id)"
             write(dotfile, "$srcname -> $destname [ label=\"$(weight)\" ];\n")
         end
     end
@@ -131,14 +134,14 @@ function Base.union(fsm1::FSM{T}, fsm2::FSM{T}) where T
     end
 
     for src in states(fsm1)
-        for link in links(fsm1, src)
-            link!(newfsm, smap[src], smap[link.dest], link.weight)
+        for arc in arcs(fsm1, src)
+            arc!(newfsm, smap[src], smap[arc.dest], arc.weight)
         end
     end
 
     for src in states(fsm2)
-        for link in links(fsm2, src)
-            link!(newfsm, smap[src], smap[link.dest], link.weight)
+        for arc in arcs(fsm2, src)
+            addarc!(newfsm, smap[src], smap[arc.dest], arc.weight)
         end
     end
 
@@ -159,8 +162,8 @@ function renormalize!(fsm::FSM{T}) where T
 
     for src in states(fsm)
         total = src.finalweight
-        for link in links(fsm, src) total += link.weight end
-        for link in links(fsm, src) link.weight /= total end
+        for arc in arcs(fsm, src) total += arc.weight end
+        for arc in arcs(fsm, src) arc.weight /= total end
         src.finalweight /= total
     end
 
@@ -197,8 +200,8 @@ function Base.replace(fsm::FSM{T}, subfsms::Dict, delim = "!") where T
             end
 
             for cs in states(subfsms[matchlabel(s.label)])
-                for link in links(subfsms[matchlabel(s.label)], cs)
-                    link!(newfsm, smap[cs], smap[link.dest], link.weight)
+                for arc in arcs(subfsms[matchlabel(s.label)], cs)
+                    addarc!(newfsm, smap[cs], smap[arc.dest], arc.weight)
                 end
             end
 
@@ -211,10 +214,10 @@ function Base.replace(fsm::FSM{T}, subfsms::Dict, delim = "!") where T
     end
 
     for osrc in states(fsm)
-        for link in links(fsm, osrc)
+        for arc in arcs(fsm, osrc)
             src = smap_out[osrc]
-            dest = smap_in[link.dest]
-            link!(newfsm, src, dest, link.weight)
+            dest = smap_in[arc.dest]
+            arc!(newfsm, src, dest, arc.weight)
         end
     end
 
@@ -245,7 +248,7 @@ Determinize the FSM w.r.t. the state labels.
 function determinize(fsm::FSM{T}) where T
     newfsm = FSM{T}()
     smap = Dict()
-    newlinks = Dict()
+    newarcs = Dict()
 
     initstates = [(s, zero(T)) for s in filter(isinit, collect(states(fsm)))]
     queue = _unique_labels(initstates, T, 0, init = true)
@@ -266,24 +269,24 @@ function determinize(fsm::FSM{T}) where T
 
         nextstates = []
         for ls in lstates
-            for link in links(fsm, ls)
-                push!(nextstates, (link.dest, link.weight))
+            for arc in arcs(fsm, ls)
+                push!(nextstates, (arc.dest, arc.weight))
             end
         end
 
         nextlabels = _unique_labels(nextstates, T, step+1, init = false)
         for (key2, value2) in nextlabels
-            w = get(newlinks, (key,key2), zero(T))
-            newlinks[(key,key2)] = w+value2[end]
+            w = get(newarcs, (key,key2), zero(T))
+            newarcs[(key,key2)] = w+value2[end]
         end
         queue = merge(queue, nextlabels)
     end
 
-    for (key, value) in newlinks
+    for (key, value) in newarcs
         src = smap[key[1]]
         dest = smap[key[2]]
         weight = value
-        link!(newfsm, src, dest, weight)
+        addarc!(newfsm, src, dest, weight)
     end
 
     newfsm
@@ -304,8 +307,8 @@ function Base.transpose(fsm::FSM{T}) where T
     end
 
     for src in states(fsm)
-        for link in links(fsm, src)
-            link!(newfsm, smap[link.dest], smap[src], link.weight)
+        for arc in arcs(fsm, src)
+            arc!(newfsm, smap[arc.dest], smap[src], arc.weight)
         end
     end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index e79aeae..016059a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,11 +20,11 @@ function makefsm(SF, S)
 
     prev = addstate!(fsm, pdfindex = 1)
     setinit!(prev)
-    link!(fsm, prev, prev)
+    addarc!(fsm, prev, prev)
     for s in 2:S
         state = addstate!(fsm, pdfindex = s)
-        link!(fsm, prev, state)
-        link!(fsm, state, state)
+        addarc!(fsm, prev, state)
+        addarc!(fsm, state, state)
         prev = state
     end
     setfinal!(prev)

From 1179ebe4d83294cd734d720dbfd51ca607ef6c18 Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Tue, 3 Aug 2021 18:49:05 +0200
Subject: [PATCH 4/6] new version of the batched forward-backward

---
 src/MarkovModels.jl |   2 +-
 src/algorithms.jl   | 139 ++++++++++++++++++++++----------------------
 src/cfsm.jl         | 127 ++++++++++++++++++++++++++--------------
 src/semifields.jl   |   3 +-
 test/runtests.jl    |  50 +++++-----------
 5 files changed, 168 insertions(+), 153 deletions(-)

diff --git a/src/MarkovModels.jl b/src/MarkovModels.jl
index b1c9734..6ab1000 100644
--- a/src/MarkovModels.jl
+++ b/src/MarkovModels.jl
@@ -51,7 +51,7 @@ Inference algorithms.
 
 include("algorithms.jl")
 
-export stateposteriors
+export pdfposteriors
 export bestpath
 
 end
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 952febb..7e281bb 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -1,119 +1,116 @@
 # SPDX-License-Identifier: MIT
 
-const Abstract3DTensor{T} = AbstractArray{T,3} where T
-
 #======================================================================
 Forward recursion
 ======================================================================#
 
-function αrecursion!(α::AbstractMatrix{T}, π::AbstractVector{T},
-                     Aᵀ::AbstractMatrix{T}, lhs::AbstractMatrix) where T
-    N = size(lhs, 2)
+function αrecursion(π::AbstractVector{SF},
+                    Tᵀ::AbstractMatrix{SF},
+                    lhs::AbstractMatrix{SF}) where SF <: Semifield
+    S, N = length(π), size(lhs, 2)
+    α = similar(lhs, SF, S, N)
     buffer = similar(lhs[:, 1])
+
     @views elmul!(α[:,1], π, lhs[:,1])
     @views for n in 2:N
-        matmul!(buffer, Aᵀ, α[:,n-1])
+        matmul!(buffer, Tᵀ, α[:,n-1])
         elmul!(α[:,n], buffer, lhs[:,n])
     end
     α
 end
 
-function αrecursion!(α::Abstract3DTensor{T}, π::AbstractVector{T},
-                     Aᵀ::AbstractMatrix{T}, lhs::Abstract3DTensor{T}) where T
-    N = size(lhs, 2)
-    buffer = similar(α[:,1,:])
-    @views elmul!(α[:,1,:], π, lhs[:,1,:])
-    @views for n in 2:N
-        matmul!(buffer, Aᵀ, α[:,n-1,:])
-        elmul!(α[:,n,:], buffer, lhs[:,n,:])
-    end
-    α
-end
-
 #======================================================================
 Backward recursion
 ======================================================================#
 
-function βrecursion!(β::AbstractMatrix{T}, ω::AbstractVector{T},
-                     A::AbstractMatrix{T}, lhs::AbstractMatrix{T}) where T
-    N = size(lhs, 2)
+function βrecursion(ω::AbstractVector{SF},
+                    T::AbstractMatrix{SF},
+                    lhs::AbstractMatrix{SF}) where SF <: Semifield
+    S, N = length(ω), size(lhs, 2)
+    β = similar(lhs, SF, S, N)
     buffer = similar(lhs[:, 1])
+
     β[:, end] = ω
     @views for n in N-1:-1:1
         elmul!(buffer, β[:,n+1], lhs[:,n+1])
-        matmul!(β[:,n], A, buffer)
-    end
-    β
-end
-
-function βrecursion!(β::Abstract3DTensor{T}, ω::AbstractVector{T},
-                     A::AbstractMatrix{T}, lhs::Abstract3DTensor{T}) where T
-    N = size(lhs, 2)
-    buffer = fill!(similar(β[:,1,:]), one(T))
-    @views elmul!(β[:,end,:], ω, buffer)
-    @views for n in N-1:-1:1
-        elmul!(buffer, β[:,n+1,:], lhs[:,n+1,:])
-        matmul!(β[:,n,:], A, buffer)
+        matmul!(β[:,n], T, buffer)
     end
     β
 end
 
 #======================================================================
-Generic forward-backward algorithm
+Specialized algorithms
 ======================================================================#
 
-function αβrecursion(π::AbstractVector{T}, ω::AbstractVector{T},
-                     A::AbstractMatrix{T}, Aᵀ::AbstractMatrix,
-                     lhs::AbstractMatrix{T}) where T
-    S, N = length(π), size(lhs, 2)
-    α = similar(lhs, T, S, N)
-    β = similar(lhs, T, S, N)
-
-    αrecursion!(α, π, Aᵀ, lhs)
-    βrecursion!(β, ω, A, lhs)
+function pdfposteriors(in_cfsm::CompiledFSM,
+                       in_lhs::AbstractMatrix{T}) where T <: Real
+    # Convert the FSM and the likelihood matrix to the Log-semifield.
+    SF = LogSemifield{T}
+    cfsm = convert(CompiledFSM{SF}, in_cfsm)
+    lhs = copyto!(similar(in_lhs, SF), in_lhs)
 
-    γ = similar(lhs, T, S, N)
-    elmul!(γ, α, β)
-    sums = sum(γ, dims = 1)
-    eldiv!(γ, γ, sums)
+    S = size(cfsm.C, 1)
+    N = size(lhs, 2)
 
-    γ, minimum(sums)
-end
+    # Expand the likelihood matrix to get the per-state likelihoods.
+    state_lhs = matmul!(similar(lhs, S, N), cfsm.C, lhs)
 
-function αβrecursion(π::AbstractVector{T}, ω::AbstractVector{T},
-                     A::AbstractMatrix{T}, Aᵀ::AbstractMatrix,
-                     lhs::Abstract3DTensor{T}) where T
-    S, N, B = size(lhs)
-    α = similar(lhs, T, S, N, B)
-    β = similar(lhs, T, S, N, B)
+    α = αrecursion(cfsm.π, cfsm.Tᵀ, state_lhs)
+    β = βrecursion(cfsm.ω, cfsm.T, state_lhs)
+    state_γ = elmul!(similar(state_lhs), α, β)
 
-    αrecursion!(α, π, Aᵀ, lhs)
-    βrecursion!(β, ω, A, lhs)
+    # Transform the per-state γs to per-likelihoods γs.
+    γ = matmul!(lhs, cfsm.Cᵀ, state_γ) # re-use `lhs` memory.
 
-    γ = similar(lhs, T, S, N, B)
-    elmul!(γ, α, β)
+    # Re-normalize the γs.
     sums = sum(γ, dims = 1)
     eldiv!(γ, γ, sums)
 
-    γ, dropdims(minimum(sums, dims = (1, 2)), dims = (1, 2))
+    # Convert the result to the Real-semiring
+    out = copyto!(similar(in_lhs), γ)
+
+    exp.(out), convert(T, minimum(sums))
 end
 
-#======================================================================
-Specialized algorithms
-======================================================================#
+function pdfposteriors(in_ucfsm::UnionCompiledFSM,
+                       in_lhs::AbstractArray{T,3}) where T <: Real
 
-_convert(T, ttl::Number) = convert(T, ttl)
-_convert(T, ttl::AbstractVector) = copyto!(similar(ttl, T), ttl)
+    # Reshape the 3D tensor to have a matrix of size BK x N where
+    # B is the number of elements in the batch.
+    in_lhs_matrix = vcat(eachslice(in_lhs, dims = 3)...)
 
-function stateposteriors(in_cfsm, in_lhs::AbstractArray{T}) where T <: Real
+    # Convert the FSM and the likelihood matrix to the Log-semifield.
     SF = LogSemifield{T}
-    cfsm = convert(CompiledFSM{SF}, in_cfsm)
-    lhs = copyto!(similar(in_lhs, SF), in_lhs)
-    γ, ttl = αβrecursion(cfsm.π, cfsm.ω, cfsm.A, cfsm.Aᵀ, lhs)
+    cfsm = convert(CompiledFSM{SF}, in_ucfsm.cfsm)
+    lhs = copyto!(similar(in_lhs_matrix, SF), in_lhs_matrix)
+
+    S = size(cfsm.C, 1)
+    K, N = size(in_lhs)
+
+    # Expand the likelihood matrix to get the per-state likelihoods.
+    state_lhs = matmul!(similar(lhs, S, N), cfsm.C, lhs)
+
+    α = αrecursion(cfsm.π, cfsm.Tᵀ, state_lhs)
+    β = βrecursion(cfsm.ω, cfsm.T, state_lhs)
+    state_γ = elmul!(similar(state_lhs), α, β)
+
+    # Transform the per-state γs to per-likelihoods γs.
+    γ = matmul!(lhs, cfsm.Cᵀ, state_γ) # re-use `lhs` memory.
+    γ = permutedims(reshape(γ, K, :, N), (1, 3, 2))
+
+    # Re-normalize each element of the batch.
+    sums = sum(γ, dims = 1)
+    eldiv!(γ, γ, sums)
+
+    # Convert the result to the Real-semiring
     out = copyto!(similar(in_lhs), γ)
-    exp.(out), _convert(T, ttl)
+
+    ttl = dropdims(minimum(sums, dims = (1, 2)), dims = (1, 2))
+    exp.(out), copyto!(similar(ttl, T), ttl)
 end
 
+@deprecate stateposteriors(cfsm, lhs) pdfposteriors(cfsm, lhs)
+
 function bestpath(in_cfsm, in_lhs::AbstractArray{T}) where T <: Real
     SF = TropicalSemifield{T}
     cfsm = convert(CompiledFSM{SF}, in_cfsm)
diff --git a/src/cfsm.jl b/src/cfsm.jl
index fc1bc3d..9fdaf27 100644
--- a/src/cfsm.jl
+++ b/src/cfsm.jl
@@ -4,54 +4,73 @@
     struct CompiledFSM{T<:Semifield}
         π        # vector of initial probabilities
         ω        # vector of final probabilities
-        A        # matrix of transition probabilities
-        Aᵀ       # transpose of `A`
-        pdfmap   # mapping state -> pdfindex
+        T        # matrix of transition probabilities
+        Tᵀ       # transpose of `A`
+        C        # matrix mapping state -> pdfindex
+        Cᵀ       # tranpose of `C`
     end
 
 Compiled FSM: matrix/vector format of an FSM used by inference
-algorithms. Note that in this form, every state is an emitting state.
+algorithms. All the fields are stored in sparse containers.
 """
-struct CompiledFSM{T<:Semifield}
-    π::AbstractVector{T}
-    ω::AbstractVector{T}
-    A::AbstractMatrix{T}
-    Aᵀ::AbstractMatrix{T}
-    pdfmap::AbstractVector{Int}
+struct CompiledFSM{SF<:Semifield}
+    π::AbstractSparseVector{SF}
+    ω::AbstractSparseVector{SF}
+    T::AbstractSparseMatrix{SF}
+    Tᵀ::AbstractSparseMatrix{SF}
+    C::AbstractSparseMatrix{SF}
+    Cᵀ::AbstractSparseMatrix{SF}
+end
+
+function Base.show(io::IO, cfsm::CompiledFSM)
+    nstates = length(cfsm.π)
+    narcs = length(nonzeros(cfsm.T))
+    print(io, "$(typeof(cfsm)) # states: $nstates # arcs: $narcs")
 end
 
 """
-    compile(fsm; allocator = spzeros)
+    compile(fsm, K)
 
 Compile `fsm` into a inference-friendly format [`CompiledFSM`](@ref).
-`allocator` is a function analogous to `zeros` which creates an
-N-dimensional array and fills it with zero elements.
+`K` is the total number of emission pdfs. Note that the fsm, is not
+requested to use all the pdf indices.
+
+!!! warning
+    This function assumes that all states of `fsm` are associated to a
+    pdf index.
+
 """
-function compile(fsm::FSM{T}; allocator = spzeros) where T
+function compile(fsm::FSM{SF}, K::Integer) where SF
     allstates = collect(states(fsm))
     S = length(allstates)
 
     # Initial states' probabilities.
-    π = allocator(T, S)
+    π = spzeros(SF, S)
     for s in filter(isinit, allstates) π[s.id] = s.initweight end
 
     # Final states' probabilities.
-    ω = allocator(T, S)
+    ω = spzeros(SF, S)
     for s in filter(isfinal, allstates) ω[s.id] = s.finalweight end
 
     # Transition matrix.
-    A = allocator(T, S, S)
-    Aᵀ = allocator(T, S, S)
+    T = spzeros(SF, S, S)
+    Tᵀ = spzeros(SF, S, S)
     for src in allstates
         for arc in arcs(fsm, src)
-            A[src.id, arc.dest.id] = arc.weight
-            Aᵀ[arc.dest.id, src.id] = arc.weight
+            T[src.id, arc.dest.id] = arc.weight
+            Tᵀ[arc.dest.id, src.id] = arc.weight
         end
     end
 
-    pdfmap = [s.pdfindex for s in sort(allstates, by = p -> p.id)]
+    # Connection matrix.
+    C = spzeros(SF, S, K)
+    Cᵀ = spzeros(SF, K, S)
+    for s in allstates
+        C[s.id, s.pdfindex] = one(SF)
+        Cᵀ[s.pdfindex, s.id] = one(SF)
+    end
 
-    CompiledFSM{T}(π, ω, A, Aᵀ, pdfmap)
+    CompiledFSM{SF}(π, ω, T, Tᵀ, C, Cᵀ)
 end
 
 """
@@ -59,35 +78,57 @@ end
 
 Move the compiled fsm `cfsm` to GPU.
 """
-function gpu(cfsm::CompiledFSM{T}) where T
-    if ! issparse(cfsm.π)
-        return CompiledFSM{T}(
-            CuArray(cfsm.π),
-            CuArray(cfsm.ω),
-            CuArray(cfsm.A),
-            CuArray(cfsm.Aᵀ),
-            CuArray(cfsm.pdfmap)
-        )
-    end
-
-    A = CuSparseMatrixCSC(cfsm.A)
-    Aᵀ = CuSparseMatrixCSC(cfsm.Aᵀ)
-    return CompiledFSM{T}(
+function gpu(cfsm::CompiledFSM{SF}) where SF
+    T = CuSparseMatrixCSC(cfsm.T)
+    Tᵀ = CuSparseMatrixCSC(cfsm.Tᵀ)
+    C = CuSparseMatrixCSC(cfsm.C)
+    Cᵀ = CuSparseMatrixCSC(cfsm.Cᵀ)
+    return CompiledFSM{SF}(
         CuSparseVector(cfsm.π),
         CuSparseVector(cfsm.ω),
-        CuSparseMatrixCSR(Aᵀ.colPtr, Aᵀ.rowVal, Aᵀ.nzVal, A.dims),
-        CuSparseMatrixCSR(A.colPtr, A.rowVal, A.nzVal, A.dims),
-        CuArray(cfsm.pdfmap)
+        CuSparseMatrixCSR(Tᵀ.colPtr, Tᵀ.rowVal, Tᵀ.nzVal, T.dims),
+        CuSparseMatrixCSR(T.colPtr, T.rowVal, T.nzVal, Tᵀ.dims),
+        CuSparseMatrixCSR(Cᵀ.colPtr, Cᵀ.rowVal, Cᵀ.nzVal, C.dims),
+        CuSparseMatrixCSR(C.colPtr, C.rowVal, C.nzVal, Cᵀ.dims),
     )
 end
 
 function Base.convert(::Type{CompiledFSM{NT}}, cfsm) where NT <: Semifield
-    CompiledFSM(
+    CompiledFSM{NT}(
         copyto!(similar(cfsm.π, NT), cfsm.π),
         copyto!(similar(cfsm.ω, NT), cfsm.ω),
-        copyto!(similar(cfsm.A, NT), cfsm.A),
-        copyto!(similar(cfsm.Aᵀ, NT), cfsm.Aᵀ),
-        copyto!(similar(cfsm.pdfmap, Int), cfsm.pdfmap)
+        copyto!(similar(cfsm.T, NT), cfsm.T),
+        copyto!(similar(cfsm.Tᵀ, NT), cfsm.Tᵀ),
+        copyto!(similar(cfsm.C, NT), cfsm.C),
+        copyto!(similar(cfsm.Cᵀ, NT), cfsm.Cᵀ),
     )
 end
 Base.convert(::Type{T}, cfsm::T) where T <: CompiledFSM = cfsm
+
+struct UnionCompiledFSM{SF<:Semifield,U}
+    cfsm::CompiledFSM{SF}
+end
+
+function Base.show(io::IO, ucfsm::UnionCompiledFSM)
+    nstates = length(ucfsm.cfsm.π)
+    narcs = length(nonzeros(ucfsm.cfsm.T))
+    print(io, "$(typeof(ucfsm)) # states: $nstates # arcs: $narcs")
+end
+
+function Base.union(cfsms::CompiledFSM{SF}...) where SF
+    U = length(cfsms)
+    UnionCompiledFSM{SF,U}(
+        CompiledFSM{SF}(
+            vcat(map(x -> x.π, cfsms)...),
+            vcat(map(x -> x.ω, cfsms)...),
+            blockdiag(map(x -> x.T, cfsms)...),
+            blockdiag(map(x -> x.Tᵀ, cfsms)...),
+            blockdiag(map(x -> x.C, cfsms)...),
+            blockdiag(map(x -> x.Cᵀ, cfsms)...),
+        )
+    )
+end
+
+function gpu(ucfsm::UnionCompiledFSM{SF,U}) where {SF,U}
+    UnionCompiledFSM{SF,U}(ucfsm.cfsm |> gpu)
+end
diff --git a/src/semifields.jl b/src/semifields.jl
index c5ea9f5..9817fc9 100644
--- a/src/semifields.jl
+++ b/src/semifields.jl
@@ -19,8 +19,7 @@ end
 abstract type Semifield <: Number end
 
 Base.convert(T::Type{<:Number}, x::Semifield) = T(x.val)
-Base.promote(x::SF, y::Real) where SF <: Semifield = x, SF(y)
-#Base.promote(x::Real, y::SF) where SF <: Semifield = SF(x), y
+Base.promote_rule(x::Type{T}, y::Type{<:Real}) where T <: Semifield = T
 Base.show(io::IO, x::Semifield) = print(io, x.val)
 
 #======================================================================
diff --git a/test/runtests.jl b/test/runtests.jl
index 016059a..d77ad58 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,6 +9,7 @@ using Test
 const D = 5 # vector/matrix dimension
 const B = 2 # batch size
 const S = 3 # number of states
+const K = 3 # Number of emissions
 const N = 5 # number of frames
 
 const T = Float64
@@ -164,33 +165,23 @@ end
     lhs = ones(T, S, N)
 
     fsm = makefsm(SF, S)
-    cfsm = compile(fsm, allocator = zeros)
+    cfsm = compile(fsm, K)
 
     γ_ref, ttl_ref = forward_backward(
-        convert(Matrix{T}, cfsm.A),
-        convert(Matrix{T}, cfsm.Aᵀ),
+        convert(Matrix{T}, cfsm.T),
+        convert(Matrix{T}, cfsm.Tᵀ),
         convert(Vector{T}, cfsm.π),
         convert(Vector{T}, cfsm.ω),
         convert(Matrix{T}, lhs)
     )
 
-    γ_dcpu, ttl_dcpu = @inferred stateposteriors(cfsm, lhs)
-    @test all(γ_ref .≈ γ_dcpu)
-    @test ttl_ref ≈ ttl_dcpu
-
-    cfsm = compile(fsm, allocator = spzeros)
-    γ_scpu, ttl_scpu = @inferred stateposteriors(cfsm, lhs)
+    γ_scpu, ttl_scpu = pdfposteriors(cfsm, lhs)
     @test all(γ_ref .≈ γ_scpu)
     @test ttl_ref ≈ ttl_scpu
 
     if CUDA.functional()
-        cfsm = compile(fsm, allocator = zeros) |> gpu
-        γ_dgpu, ttl_dgpu = @inferred stateposteriors(cfsm, CuArray(lhs))
-        @test all(γ_ref .≈ convert(Matrix{T}, γ_dgpu))
-        @test ttl_ref ≈ ttl_dgpu
-
-        cfsm = compile(fsm, allocator = spzeros) |> gpu
-        γ_sgpu, ttl_sgpu = @inferred stateposteriors(cfsm, CuArray(lhs))
+        cfsm = cfsm |> gpu
+        γ_sgpu, ttl_sgpu = pdfposteriors(cfsm, CuArray(lhs))
         @test all(γ_ref .≈ convert(Matrix{T}, γ_sgpu))
         @test ttl_ref ≈ ttl_sgpu
     end
@@ -200,39 +191,26 @@ end
     lhs = ones(T, S, N, B)
 
     fsm = makefsm(SF, S)
-    cfsm = compile(fsm, allocator = zeros)
+    cfsm = compile(fsm, K)
 
     γ_ref, ttl_ref = forward_backward(
-        convert(Matrix{T}, cfsm.A),
-        convert(Matrix{T}, cfsm.Aᵀ),
+        convert(Matrix{T}, cfsm.T),
+        convert(Matrix{T}, cfsm.Tᵀ),
         convert(Vector{T}, cfsm.π),
         convert(Vector{T}, cfsm.ω),
         convert(Matrix{T}, lhs[:, :, 1])
     )
 
-    γ_dcpu, ttl_dcpu = @inferred stateposteriors(cfsm, lhs)
-    for b in 1:B
-        @test all(γ_ref .≈ γ_dcpu[:,:,b])
-        @test ttl_ref ≈ ttl_dcpu[b]
-    end
-
-    cfsm = compile(fsm, allocator = spzeros)
-    γ_scpu, ttl_scpu = @inferred stateposteriors(cfsm, lhs)
+    cfsms = union([cfsm for i in 1:B]...)
+    γ_scpu, ttl_scpu = pdfposteriors(cfsms, lhs)
     for b in 1:B
         @test all(γ_ref .≈ γ_scpu[:,:,b])
         @test ttl_ref ≈ ttl_scpu[b]
     end
 
     if CUDA.functional()
-        cfsm = compile(fsm, allocator = zeros) |> gpu
-        γ_dgpu, ttl_dgpu = @inferred stateposteriors(cfsm, CuArray(lhs))
-        for b in 1:B
-            @test all(γ_ref .≈ convert(Array{T,3}, γ_dgpu)[:,:,b])
-            @test ttl_ref ≈ convert(Vector{T}, ttl_dgpu)[b]
-        end
-
-        cfsm = compile(fsm, allocator = spzeros) |> gpu
-        γ_sgpu, ttl_sgpu = @inferred stateposteriors(cfsm, CuArray(lhs))
+        cfsms = cfsms |> gpu
+        γ_sgpu, ttl_sgpu = pdfposteriors(cfsms, CuArray(lhs))
         for b in 1:B
             @test all(γ_ref .≈ convert(Array{T,3}, γ_sgpu)[:,:,b])
             @test ttl_ref ≈ convert(Vector{T}, ttl_sgpu)[b]

From 3b6963fa453e8946a575bee0262e6a0e2a36da5c Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Thu, 5 Aug 2021 15:51:03 +0200
Subject: [PATCH 5/6] added documentation for the pdfposterior function

---
 docs/src/inference.md | 11 ++---------
 src/algorithms.jl     | 13 +++++++++++++
 test/runtests.jl      | 12 ++++++------
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/docs/src/inference.md b/docs/src/inference.md
index 5b9a48a..322d6e7 100644
--- a/docs/src/inference.md
+++ b/docs/src/inference.md
@@ -1,14 +1,7 @@
 # Inference
 
-## Baum-Welch algoritm (forward-backward)
-
-The Baum-Welch algorithm computes the probability to be in a state `i`
-at time $n$:
-```math
-p(z_n = i | x_1, ..., x_N)
-```
-It is implemented by the [`αβrecursion`](@ref) function.
+## Basic algorithms
 
 ```@docs
-αβrecursion
+pdfposteriors
 ```
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 7e281bb..d98995b 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -42,6 +42,19 @@ end
 Specialized algorithms
 ======================================================================#
 
+"""
+    pdfposteriors(cfsm, lhs)
+    pdfposteriors(union(cfsm1, cfsm2, ...), batch_lhs)
+
+Calculate the conditional posterior of "assigning" the $n$th frame
+to the $i$th pdf. The output is a tuple `γ, ttl` where `γ` is a matrix
+(# pdf x # frames) and `ttl` is the total probability of the sequence.
+This function can also be caused in "batch-mode" by providing a union
+of compiled fsm and a 3D tensor containing the per-state, per-frame
+and per-batch values.
+"""
+pdfposteriors
+
 function pdfposteriors(in_cfsm::CompiledFSM,
                        in_lhs::AbstractMatrix{T}) where T <: Real
     # Convert the FSM and the likelihood matrix to the Log-semifield.
diff --git a/test/runtests.jl b/test/runtests.jl
index c40e1d9..238120a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -228,9 +228,9 @@ end
     setinit!(s1)
     setfinal!(s3)
 
-    link!(fsm, s1, s2)
-    link!(fsm, s2, s3)
-    link!(fsm, s3, s1)
+    addarc!(fsm, s1, s2)
+    addarc!(fsm, s2, s3)
+    addarc!(fsm, s3, s1)
 
     ns = collect(filter(s -> ! MarkovModels.isemitting(s),
                         MarkovModels.states(fsm |> remove_eps)))
@@ -257,9 +257,9 @@ end
     setinit!(s1)
     setfinal!(s3)
 
-    link!(fsm, s1, s2)
-    link!(fsm, s2, s3)
-    link!(fsm, s3, s1)
+    addarc!(fsm, s1, s2)
+    addarc!(fsm, s2, s3)
+    addarc!(fsm, s3, s1)
 
     @test_throws MarkovModels.InvalidFSMError determinize(fsm)
 end

From e7d494b7768271b8a71f0440ee947da9368addb6 Mon Sep 17 00:00:00 2001
From: Lucas Ondel <lucas.ondel@gmail.com>
Date: Thu, 5 Aug 2021 15:57:18 +0200
Subject: [PATCH 6/6] escaped dollar symbols

---
 src/algorithms.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/algorithms.jl b/src/algorithms.jl
index d98995b..8118758 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -46,8 +46,8 @@ Specialized algorithms
     pdfposteriors(cfsm, lhs)
     pdfposteriors(union(cfsm1, cfsm2, ...), batch_lhs)
 
-Calculate the conditional posterior of "assigning" the $n$th frame
-to the $i$th pdf. The output is a tuple `γ, ttl` where `γ` is a matrix
+Calculate the conditional posterior of "assigning" the \$n\$th frame
+to the \$i\$th pdf. The output is a tuple `γ, ttl` where `γ` is a matrix
 (# pdf x # frames) and `ttl` is the total probability of the sequence.
 This function can also be caused in "batch-mode" by providing a union
 of compiled fsm and a 3D tensor containing the per-state, per-frame