## GPU
* Support both GPU and CPU training

In [None]:
using SparseArrays

In [None]:
# generic fallback: anything without a more specific `device` method is simply
# forwarded to `gpu`
device(x) = gpu(x)

# sparse inputs always come back dense:
#   * with a working CUDA setup, the array is passed through `gpu` first and the
#     result is then wrapped in a `CUDA.CuArray`
#   * otherwise it is materialised with `collect`
function device(x::AbstractSparseArray)
    if CUDA.functional()
        return CUDA.CuArray(gpu(x))
    end
    return collect(x)
end

# when CUDA is not usable, give BLAS as many threads as Julia was started with
CUDA.functional() || LinearAlgebra.BLAS.set_num_threads(Threads.nthreads());

# some models have custom semantics where non-trainable parameters live on the gpu.
# To support that, a Chain is moved one layer at a time so that every layer goes
# through its own `device` method; the Chain is then rebuilt from the vector of
# moved layers.
function device(m::Chain)
    moved = map(i -> device(m[i]), 1:length(m))
    return Chain(moved)
end

# moving a Chain back to the cpu is also done layer by layer: `cpu` is applied to
# each indexed layer and the Chain is rebuilt from the resulting vector
function Flux.cpu(m::Chain)
    moved = map(i -> cpu(m[i]), 1:length(m))
    return Chain(moved)
end;