## GPU
* Supports both GPU and CPU training

In [None]:
# When CUDA is not functional, set the BLAS thread count to the number of Julia threads.
CUDA.functional() || LinearAlgebra.BLAS.set_num_threads(Threads.nthreads());

In [None]:
# Generic fallback: hand `x` to `gpu` and return whatever it returns.
function device(x)
    return gpu(x)
end

# Turn a sparse array into a dense one on the active device.
# With a functional CUDA setup the array is passed through `gpu` first and the result is
# then converted with `CUDA.CuArray`; otherwise it is materialised with `collect`.
# NOTE(review): presumably densifying after the transfer is what makes the GPU path
# efficient -- confirm against the CUDA sparse conversion behaviour.
function device(x::SparseArrays.AbstractSparseArray)
    if CUDA.functional()
        return CUDA.CuArray(gpu(x))
    end
    return collect(x)
end
# Tuples are converted element by element; the result is again a tuple.
device(x::Tuple) = map(device, x)

# Some models have custom semantics where non-trainable parameters live on the gpu.
# To respect that, a `Chain` is not moved as a whole: every layer is passed through
# `device` individually (so layer-specific `device` methods apply) and a new `Chain`
# is built from the vector of converted layers.
function device(m::Chain)
    layers = map(i -> device(m[i]), 1:length(m))
    return Chain(layers)
end
# Counterpart for the move back to the cpu: each layer of the `Chain` is passed through
# `cpu` individually and a new `Chain` is built from the vector of converted layers.
# NOTE(review): this adds a method to Flux's own `cpu` for Flux's own `Chain` type,
# so it changes behaviour for every `Chain` in the session -- intentional override.
function Flux.cpu(m::Chain)
    layers = map(i -> cpu(m[i]), 1:length(m))
    return Chain(layers)
end

In [None]:
# Free `x` via `CUDA.unsafe_free!` when CUDA is functional.
# Without a functional CUDA setup this is a no-op that returns `nothing`.
function device_free!(x)
    if CUDA.functional()
        return CUDA.unsafe_free!(x)
    else
        return nothing
    end
end
# Tuples are freed element by element; the per-element results come back as a tuple.
device_free!(x::Tuple) = map(device_free!, x);

In [None]:
# custom Flux layers
# `Join` builds a `Parallel` layer from a combining function and its paths. The paths
# may be given as a single `paths` argument or as separate trailing arguments, which
# are gathered into a tuple and forwarded to the two-argument method.
function Join(combine, paths)
    return Parallel(combine, paths)
end
Join(combine, paths...) = Join(combine, paths);