In [1]:
] activate ~/.julia/environments/v1.5/Project.toml

[32m[1m Activating[22m[39m environment at `~/.julia/environments/v1.5/Project.toml`


In [2]:
using CUDA

In [3]:
N = 128

128

In [11]:
? fill

search: [0m[1mf[22m[0m[1mi[22m[0m[1ml[22m[0m[1ml[22m [0m[1mf[22m[0m[1mi[22m[0m[1ml[22m[0m[1ml[22m! [0m[1mf[22m[0m[1mi[22mna[0m[1ml[22m[0m[1ml[22my [0m[1mf[22m[0m[1mi[22mnda[0m[1ml[22m[0m[1ml[22m [0m[1mf[22m[0m[1mi[22m[0m[1ml[22mter [0m[1mf[22m[0m[1mi[22m[0m[1ml[22mter! [0m[1mf[22m[0m[1mi[22m[0m[1ml[22mesize [0m[1mf[22m[0m[1mi[22m[0m[1ml[22memode is[0m[1mf[22m[0m[1mi[22m[0m[1ml[22me



```
fill(x, dims::Tuple)
fill(x, dims...)
```

Create an array filled with the value `x`. For example, `fill(1.0, (5,5))` returns a 5×5 array of floats, with each element initialized to `1.0`.

`dims` may be specified as either a tuple or a sequence of arguments. For example, the common idiom `fill(x)` creates a zero-dimensional array containing the single value `x`.

# Examples

```jldoctest
julia> fill(1.0, (2,3))
2×3 Array{Float64,2}:
 1.0  1.0  1.0
 1.0  1.0  1.0

julia> fill(42)
0-dimensional Array{Int64,0}:
42
```

If `x` is an object reference, all elements will refer to the same object:

```jldoctest
julia> A = fill(zeros(2), 2);

julia> A[1][1] = 42; # modifies both A[1][1] and A[2][1]

julia> A
2-element Array{Array{Float64,1},1}:
 [42.0, 0.0]
 [42.0, 0.0]
```


In [57]:
x_d = fill(1.0,128)
y_d = fill(2.0,128);

In [58]:
using Flux

In [43]:
import Flux:|>

In [59]:
cu_x_d = x_d |> gpu
cu_y_d = y_d |> gpu;

In [60]:
cu_y_d .+= cu_x_d
@test all(Array(cu_y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [64]:
using BenchmarkTools, ForwardDiff, LinearAlgebra, Random

In [72]:
function transition_M!(F::Array{TT,2}, σ2::TT, λ::TT, μ::TT, dx::UU,
        xc::Vector{TT}, n::Int, dt::Float64) where {TT,UU <: Any}

    F[1,1] = one(TT); F[n,n] = one(TT); F[:,2:n-1] = zeros(TT,n,n-2)

    ndeltas = max(70,ceil(Int, 10. *sqrt(σ2)/dx))

    deltaidx = collect(-ndeltas:ndeltas)
    deltas = deltaidx * (5. *sqrt(σ2))/ndeltas
    ps = exp.(-0.5 * (5*deltaidx./ndeltas).^2)
    ps = ps/sum(ps)

    @inbounds for j = 2:n-1

        #abs(λ) < 1e-150 ? mu = xc[j] + μ : mu = exp(λ*dt)*(xc[j] + μ/(λ*dt)) - μ/(λ*dt)
        #abs(λ) < 1e-150 ? mu = xc[j] + h * dt : mu = exp(λ*dt)*(xc[j] + h/λ) - h/λ
        #mu = exp(λ*dt)*xc[j] + μ * (exp(λ*dt) - 1.)/(λ*dt)
        #mu = exp(λ*dt)*xc[j] + μ * (expm1(λ*dt)/(λ*dt)
        mu = exp(λ*dt)*xc[j] + μ * expm1_div_x(λ*dt)

        #now we're going to look over all the slices of the gaussian
        for k = 1:2*ndeltas+1

            s = mu + deltas[k]

            if s <= xc[1]

                F[1,j] += ps[k]

            elseif s >= xc[n]

                F[n,j] += ps[k]

            else

                if (xc[1] < s) && (xc[2] > s)

                    lp,hp = 1,2

                elseif (xc[n-1] < s) && (xc[n] > s)

                    lp,hp = n-1,n

                else

                    hp,lp = ceil(Int, (s-xc[2])/dx) + 2, floor(Int, (s-xc[2])/dx) + 2

                end

                if hp == lp

                    F[lp,j] += ps[k]

                else

                    dd = xc[hp] - xc[lp]
                    F[hp,j] += ps[k]*(s-xc[lp])/dd
                    F[lp,j] += ps[k]*(xc[hp]-s)/dd

                end

            end

        end

    end

end

transition_M! (generic function with 1 method)

In [95]:
function transition_M_opt!(P::Array{TT,2}, σ2::TT, λ::TT, μ::Vector{TT,1}, dx::UU,
        xc::Vector{TT}, n::Int, dt::Float64) where {TT,UU <: Any}


    block_idx = Int64(blockIdx().x)
    thread_idx = Int64(threadIdx().x)
    
    if thread_idx > size(P,2)
        return
    end
    
    if block_idx > size(P,1)
        return
    end
    
    i,j = block_idx, thread_idx
    
    F[1,1] = one(TT); F[n,n] = one(TT); F[:,2:n-1] = zeros(TT,n,n-2)

    ndeltas = max(70,ceil(Int, 10. *sqrt(σ2)/dx))

    deltaidx = collect(-ndeltas:ndeltas)
    deltas = deltaidx * (5. *sqrt(σ2))/ndeltas
    ps = exp.(-0.5 * (5*deltaidx./ndeltas).^2)
    ps = ps/sum(ps)

    @inbounds for j = 2:n-1

        #abs(λ) < 1e-150 ? mu = xc[j] + μ : mu = exp(λ*dt)*(xc[j] + μ/(λ*dt)) - μ/(λ*dt)
        #abs(λ) < 1e-150 ? mu = xc[j] + h * dt : mu = exp(λ*dt)*(xc[j] + h/λ) - h/λ
        #mu = exp(λ*dt)*xc[j] + μ * (exp(λ*dt) - 1.)/(λ*dt)
        #mu = exp(λ*dt)*xc[j] + μ * (expm1(λ*dt)/(λ*dt)
        mu = exp(λ*dt)*xc[j] + μ * expm1_div_x(λ*dt)

        #now we're going to look over all the slices of the gaussian
        for k = 1:2*ndeltas+1

            s = mu + deltas[k]

            if s <= xc[1]

                F[1,j] += ps[k]

            elseif s >= xc[n]

                F[n,j] += ps[k]

            else

                if (xc[1] < s) && (xc[2] > s)

                    lp,hp = 1,2

                elseif (xc[n-1] < s) && (xc[n] > s)

                    lp,hp = n-1,n

                else

                    hp,lp = ceil(Int, (s-xc[2])/dx) + 2, floor(Int, (s-xc[2])/dx) + 2

                end

                if hp == lp

                    F[lp,j] += ps[k]

                else

                    dd = xc[hp] - xc[lp]
                    F[hp,j] += ps[k]*(s-xc[lp])/dd
                    F[lp,j] += ps[k]*(xc[hp]-s)/dd

                end

            end

        end

    end

end

transition_M_opt! (generic function with 1 method)

In [96]:
function transition_M(σ2::TT, λ::TT, μ::TT, dx::UU,
        xc::Vector{TT}, n::Int, dt::Float64) where {TT,UU <: Any}

    M = zeros(TT,n,n)
    transition_M_opt!(M,σ2,λ,μ,dx,xc,n,dt)

    return M

end

transition_M (generic function with 1 method)

In [None]:
function latent_one_step!(P::Vector{TT}, F::Array{TT,2}, λ::TT, σ2_a::TT, σ2_s::TT,
        t::Int, nL::Vector{Int}, nR::Vector{Int},
        La::Vector{TT}, Ra::Vector{TT}, M::Array{TT,2},
        dx::UU, xc::Vector{TT}, n::Int, dt::Float64) where {TT,UU <: Any}

    any(t .== nL) ? sL = sum(La[t .== nL]) : sL = zero(TT)
    any(t .== nR) ? sR = sum(Ra[t .== nR]) : sR = zero(TT)

    σ2 = σ2_s * (sL + sR);   μ = -sL + sR
    
    if (sL + sR) > zero(TT)
        transition_M!(F,σ2+σ2_a*dt,λ, μ, dx, xc, n, dt)
        P = F * P
    else
        P = M * P
    end

    return P, F

end

In [73]:
using pulse_input_DDM

In [83]:
import pulse_input_DDM:bins,expm1_div_x

In [84]:
θ_generative = θchoice(θz=θz(σ2_i = 5., B = 8., λ = -1., σ2_a = 20., σ2_s = 0.5,
    ϕ = 0.8, τ_ϕ = 0.05), bias=2., lapse=0.05);

In [85]:
n, dt = 53, 1e-2

(53, 0.01)

In [86]:
xc, dx = bins(θ_generative.θz.B, n)

([-8.156862745098039, -7.8431372549019605, -7.529411764705882, -7.215686274509804, -6.901960784313726, -6.588235294117647, -6.2745098039215685, -5.96078431372549, -5.647058823529412, -5.333333333333333  …  5.333333333333333, 5.647058823529412, 5.96078431372549, 6.2745098039215685, 6.588235294117647, 6.901960784313726, 7.215686274509804, 7.529411764705882, 7.8431372549019605, 8.156862745098039], 0.3137254901960784)

In [94]:
@benchmark transition_M(θ_generative.θz.σ2_a, θ_generative.θz.λ, 0., dx, xc, n, dt)

BenchmarkTools.Trial: 
  memory estimate:  3.14 MiB
  allocs estimate:  201772
  --------------
  minimum time:     3.899 ms (0.00% GC)
  median time:      4.222 ms (0.00% GC)
  mean time:        4.879 ms (7.34% GC)
  maximum time:     19.267 ms (76.99% GC)
  --------------
  samples:          1024
  evals/sample:     1

In [71]:
CUDA.allowscalar(true)
Random.seed!(1111)

function tcudiff(N, ::Type{T} = Float32) where T<:Real
  A = rand(T, N, N)

  cuA = A |> gpu

  f(A) = sum(A .+ A*A .+ T(1))
#  f(A) = sum(A .+ A*A)

  @info "test f cpu: $(f(A))"
  (N<5) && @info "test ∇cpu: $(ForwardDiff.gradient(f, A))"
  @btime ForwardDiff.gradient($f, $A)

  @info "test f gpu: $(f(cuA))"
  (N<5) && @info "test ∇gpu: $(ForwardDiff.gradient(f, cuA))"
  @btime ForwardDiff.gradient($f, $cuA)
end

tcudiff(60)

┌ Info: test f cpu: 60024.59
└ @ Main In[71]:12


  362.935 ms (5400 allocations: 107.54 MiB)


┌ Info: test f gpu: 60024.59
└ @ Main In[71]:16
└ @ GPUArrays /home/jg5821/.julia/packages/GPUArrays/Z5nPF/src/host/indexing.jl:64


  49.835 ms (71695 allocations: 3.98 MiB)


60×60 CuArray{Float32,2}:
 61.2084  61.6162  58.6687  61.6423  …  60.9841  66.3073  66.1485  62.3897
 58.8725  59.2803  56.3328  59.3064     58.6482  63.9714  63.8125  60.0538
 60.2208  60.6286  57.6811  60.6547     59.9966  65.3197  65.1609  61.4021
 66.136   66.5438  63.5963  66.57       65.9118  71.235   71.0761  67.3174
 61.5908  61.9986  59.0511  62.0248     61.3666  66.6898  66.5309  62.7721
 57.9839  58.3917  55.4442  58.4178  …  57.7597  63.0828  62.924   59.1652
 59.1025  59.5103  56.5628  59.5365     58.8783  64.2015  64.0426  60.2838
 61.38    61.7878  58.8403  61.814      61.1558  66.479   66.3201  62.5614
 62.3837  62.7915  59.844   62.8177     62.1595  67.4827  67.3238  63.565
 62.8164  63.2242  60.2767  63.2504     62.5922  67.9154  67.7565  63.9978
 63.8342  64.242   61.2945  64.2682  …  63.61    68.9332  68.7743  65.0156
 66.0895  66.4972  63.5497  66.5234     65.8652  71.1884  71.0295  67.2708
 62.3745  62.7823  59.8348  62.8084     62.1503  67.4734  67.3146  63.5558


In [7]:
function hello_world()
   @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x))
   return
end
@cuda blocks=2 threads=2 hello_world()
synchronize()

Greetings from block 2, thread 1!
Greetings from block 2, thread 2!
Greetings from block 1, thread 1!
Greetings from block 1, thread 2!


In [8]:
using Test

In [None]:
x

In [9]:
y_d .+= x_d
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m