make NNlibCUDA an extension #492

Merged (11 commits) on Jun 14, 2023
Changes from 1 commit
21 changes: 7 additions & 14 deletions .buildkite/pipeline.yml
@@ -1,30 +1,22 @@
steps:
- label: "GPU julia v1.6"
- label: "CUDA - Julia v1.9"
plugins:
- JuliaCI/julia#v1:
version: "1.6"
Member:

Without this, we have no way to tell if changes to NNlib break NNlibCUDA on Julia <1.9. So either we add something like I mentioned in #495 (comment) or we create a separate Reverse CI step/pipeline for NNlibCUDA.

Member Author:

I don't understand the issue here. Changes to NNlib won't affect Julia < 1.9 users, since NNlib requires Julia >= 1.9 from now on.

Member Author:

For backports, if there is ever going to be one, we can test locally.

Member (@ToucheSir, Jun 14, 2023):

I missed that Project.toml also bumped the Julia compat. To be clear then, merging this would mean we're stopping feature development for Julia <1.9 and now maintaining two backport branches (one for NNlib and one for NNlibCUDA)? I recall there being mixed success with backport branches in FluxML before; are there any lessons learned from that so this doesn't run into the same issues? cc @FluxML/committers

Edit: I misspoke about the two backport branches. Depending on how we want to handle extensions in Flux, this may require three backport branches, right?

Member Author:

> To be clear then, merging this would mean we're stopping feature development for Julia <1.9 and now maintaining two backport branches (one for NNlib and one for NNlibCUDA)?

We are stopping development for Julia < 1.9, and we will maintain backport branches if we ever feel the need and care enough about backporting something, which I consider an unlikely event. In expectation, the benefits far outweigh the drawbacks.

Member:

1 vote for moving to 1.9 soon!

Member:

Ok, in that case I'll cast my vote for this too. The rest of the PR LGTM.

version: "1.9"
- JuliaCI/julia-test#v1: ~
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
- src
# commands:
# - julia --project=test -e """
# Pkg.develop(url = \"https://github.com/FluxML/NNlibCUDA.jl\")
# Pkg.instantiate()
# Pkg.build()
# Pkg.status()
# Pkg.test()
# Pkg.test(\"NNlibCUDA\")
# """
- ext
agents:
queue: "juliagpu"
cuda: "*"
env:
NNLIB_TEST_CUDA: true
timeout_in_minutes: 60

- label: "GPU julia v1"
- label: "CUDA - Julia v1"
plugins:
- JuliaCI/julia#v1:
version: "1"
@@ -33,6 +25,7 @@ steps:
codecov: true
dirs:
- src
- ext
agents:
queue: "juliagpu"
cuda: "*"
@@ -55,10 +48,10 @@ steps:
if: build.pull_request.labels includes "benchmark"
timeout_in_minutes: 30

- label: "AMDGPU - Julia 1.9"
- label: "AMDGPU - Julia v1.9"
plugins:
- JuliaCI/julia#v1:
version: 1.9-nightly
version: "1.9"
- JuliaCI/julia-test#v1:
- JuliaCI/julia-coverage#v1:
codecov: true
1 change: 0 additions & 1 deletion .gitignore
@@ -12,7 +12,6 @@ deps.jl
*.log
.vscode/
/Manifest.toml
lib/NNlibCUDA/Manifest.toml
benchmark/Manifest.toml
benchmark/*.json
benchmark/report.md
11 changes: 9 additions & 2 deletions Project.toml
@@ -1,11 +1,12 @@
name = "NNlib"
uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
version = "0.8.20"
version = "0.9.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
Member Author:

I had to make cuDNN a strong dependency because it seems there is no way to make using CUDA also trigger the loading of cuDNN from the extension. This is not ideal, but the alternatives seem to be worse:

  • have the extension triggered by using CUDA, cuDNN (sketched below)
  • keep NNlibCUDA as a separate package
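
For concreteness, a rough sketch of what that first option could look like in Project.toml, with the extension loading only once both packages are present. This is not what the present commit does (here cuDNN sits in [deps]); the UUIDs are the ones already used in this PR.

# Hypothetical declaration for the "using CUDA, cuDNN" trigger:
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[extensions]
NNlibCUDAExt = ["CUDA", "cuDNN"]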

Member Author:

Is there some other option?

Member:

Making it a strong dep is a non-starter, because cuDNN directly depends on CUDA.jl (so NNlib would end up with all of CUDA.jl as a hard dependency anyway). I would say we go with option 1, but there are a couple of variations on it we could also consider:

  1. Make only cuDNN a weak dep and access CUDA.jl through cuDNN.CUDA (see the sketch below).
  2. Create separate extensions for CUDA.jl and cuDNN. Then someone can choose to only load the former if they don't need e.g. conv functionality. This doesn't make anything easier, so we should probably consider it after an extension is in place.
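
A minimal sketch of that first variation, assuming the extension keeps the NNlibCUDAExt name and cuDNN alone is the trigger; the commented-out method is purely illustrative:

module NNlibCUDAExt

using NNlib
using cuDNN
const CUDA = cuDNN.CUDA   # reach CUDA.jl through cuDNN's own dependency

# GPU-specific methods would then dispatch on CUDA.CuArray as usual, e.g.
# overloads of NNlib functions such as batchnorm or conv! living in this module.

end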

Member Author:

I also think we should go with using CUDA, cuDNN. This will also carry over to Flux's usage.
I'm annoyed by portability issues for scripts. Ideally, it should be possible to run the same script on any machine without any edits, using the appropriate device, with something like:

using Flux

gpu_backend = ... # Some hardware detection utility or Preferences.jl magic?

if gpu_backend == "cuda"
    using CUDA, cuDNN
elseif gpu_backend == "amdgpu"
    using AMDGPU
elseif gpu_backend == "metal"
    using Metal
end

Even better, the whole loading would be handled conditionally by Flux itself, but I don't think this can be done without hard dependencies.

Anyway, since this is a crucial decision, let's also ask @mcabbott and @darsnack whether they are OK with having the CUDA extension here be triggered by using CUDA, cuDNN.

Member:

I really don't like using Flux, CUDA, cuDNN; it seems a huge shame to have to load two packages, one of which is a super-obscure internal thing. I don't even know where it lives: https://www.google.com/search?q=cuDNN.jl leads me only to one abandoned 5 years ago.

It's a huge shame to give up on using Flux, CUDA as the interface. I understand that the default use of new package extensions does not allow us to then load cuDNN. I wonder if we should seriously consider either hacks for now (can Requires or something like it load cuDNN?) or finding out if upstream can be changed e.g. to unify cuDNN into CUDA.
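
For reference, the kind of Requires-style hack being floated might look roughly like the sketch below. It is only an illustration under the assumption that cuDNN is already present in the user's environment; that fragility, plus the loss of precompilation for the glue code, is largely why package extensions exist.

# Hypothetical code inside NNlib itself, not part of this PR:
using Requires

function __init__()
    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
        # Pull in cuDNN once CUDA.jl is loaded; this only works if cuDNN is in
        # the active environment, and the glue code is not precompiled.
        @eval Main using cuDNN
        include("cuda_glue.jl")   # hypothetical file holding the GPU methods
    end
end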

Contributor:

Some packages might want to expose GPU-accelerated functionality without users having to depend on either CUDA or CUDNN. With Preferences, the user environment would then need to include CUDA (i.e. in the Project.toml) in order to set the CUDNN preference.

Member:

Does JuliaPackaging/Preferences.jl#24 work for package extension dependencies? The example seems to imply that packages can set preferences for their dependencies.

Contributor:

I'm not sure. In any case, that example also shows how the active project needs to have A as a hard dependency, either in [deps] or in [extras], which is the point I was making above.

That said, although I'm happy to consider alternative ways of loading CUDNN-like functionality, I don't see it happening soon. Without a first-class language feature and using Preferences.jl, it would require users to import CUDA.jl to enable the CUDNN features, which IMO is mostly the same as having them do using cuDNN. And even with a first-class feature where, say, packages could express in their Project.toml which features they request of a package, it doesn't seem clear how that would interact with package extensions (what if a user has CUDA.jl but not CUDA.jl+CUDNN, etc).
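
To make that concrete, a minimal Preferences.jl sketch of such an opt-in. The preference name is hypothetical (nothing reads it today), and the call only succeeds if CUDA is already listed in the active project's [deps] or [extras], which is exactly the friction being described:

using Preferences, UUIDs

# CUDA.jl's UUID, as listed in Project.toml above.
cuda_uuid = UUID("052768ef-5323-5732-b1bb-66c8b64840ba")

# Hypothetical preference toggling CUDNN-backed functionality.
set_preferences!(cuda_uuid, "use_cudnn" => true)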

Member:

By sheer coincidence, I noticed JuliaPackaging/Preferences.jl#53 was reported today.

Contributor:

FWIW, I'm currently too busy working on other things to consider reworking the CUDA.jl/cuDNN.jl situation again (especially now that it just stabilized a bit after introducing JLLs), but I'm not opposed to changes. So if anybody would want to explore a different mechanism for shipping CUDA libraries, feel free to open an issue or PR.

GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -16,19 +17,25 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
NNlibAMDGPUExt = "AMDGPU"
NNlibCUDAExt = "CUDA"

[compat]
AMDGPU = "0.4.8"
Adapt = "2, 3.2"
Atomix = "0.1"
ChainRulesCore = "1.13"
CUDA = "4"
cuDNN = "1"
GPUArraysCore = "0.1"
KernelAbstractions = "0.9.2"
Requires = "0.5, 1.0"
julia = "1.6"
julia = "1.9"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

2 changes: 1 addition & 1 deletion README.md
@@ -16,4 +16,4 @@ This package provides a library of functions useful for neural networks, such as

For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl).

To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well.
GPU support is provided whenever the corresponding package (e.g. [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) or [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)) is loaded.
3 changes: 1 addition & 2 deletions docs/src/index.md
@@ -4,5 +4,4 @@

For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl).

To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well.
For [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load it and NNlib in the same Julia session.
To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) or [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load them and NNlib in the same Julia session.
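
For illustration, using the extension after this change would look something like the following on a CUDA machine. This is a sketch only; depending on how the trigger question in the discussion above is settled, cuDNN may or may not need to be loaded explicitly.

using NNlib, CUDA, cuDNN   # loading the GPU packages activates NNlibCUDAExt

x = CUDA.rand(Float32, 10, 8)
y = softmax(x; dims = 1)   # NNlib functions now accept CuArrays; cuDNN-backed kernels come from the extension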
20 changes: 0 additions & 20 deletions ext/NNlibCUDA/.buildkite/pipeline.yml

This file was deleted.

26 changes: 0 additions & 26 deletions ext/NNlibCUDA/.github/workflows/compathelper.yml

This file was deleted.

15 changes: 0 additions & 15 deletions ext/NNlibCUDA/.github/workflows/tagbot.yml

This file was deleted.

1 change: 0 additions & 1 deletion ext/NNlibCUDA/.gitignore

This file was deleted.

23 changes: 0 additions & 23 deletions ext/NNlibCUDA/LICENSE.md

This file was deleted.

29 changes: 0 additions & 29 deletions ext/NNlibCUDA/Project.toml

This file was deleted.

5 changes: 0 additions & 5 deletions ext/NNlibCUDA/README.md

This file was deleted.

27 changes: 0 additions & 27 deletions ext/NNlibCUDA/test/batchnorm.jl

This file was deleted.

@@ -1,4 +1,4 @@
module NNlibCUDA
module NNlibCUDAExt

using NNlib
using CUDA, cuDNN
Expand Down
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion ext/NNlibCUDA/src/ctc.jl → ext/NNlibCUDAExt/ctc.jl
@@ -1,4 +1,4 @@
# CTC loss moved from Flux.jl to NNlib + NNlibCUDA
# CTC loss moved from Flux.jl to NNlib

import NNlib: ctc_loss, ctc_alpha, ∇ctc_loss

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion src/activations.jl
@@ -1,6 +1,6 @@
## Activation functions
#
# Some of activation functions have its wrapper function for GPU in NNlibCUDA.jl.
# Some of activation functions have its wrapper function for GPU in NNlibCUDAExt.jl.
# https://github.com/JuliaGPU/CuArrays.jl/issues/614

ACTIVATIONS = [
2 changes: 1 addition & 1 deletion src/ctc.jl
@@ -1,4 +1,4 @@
# CTC loss moved from Flux.jl to NNlib + NNlibCUDA
# CTC loss moved from Flux.jl to NNlib

## CPU implementation

20 changes: 0 additions & 20 deletions src/deprecations.jl
@@ -1,23 +1,3 @@

### Deprecated while v0.7 was latest

function ∇softmax(Δ, x; dims = 1)
# This 2-arg version recomputes the forward pass, which is slow.
# Removed from use in 0.7, but only prints a warning during 0.8:
Base.depwarn("`∇softmax(Δ, x)` without `y = softmax(x)` argument is deprecated, as this is inefficient, please use `∇softmax_data(dy, y)`", :∇softmax)
∇softmax(Δ, x, softmax(x; dims); dims)
end
∇softmax!(Δ, x; dims = 1) = Δ .= ∇softmax(Δ, x; dims)
∇softmax!(out, Δ, x; dims = 1) = out .= ∇softmax(Δ, x; dims)

function ∇logsoftmax(Δ, x; dims = 1)
Base.depwarn("`∇logsoftmax(Δ, x)` without `y = logsoftmax(x)` argument is deprecated, please use `∇logsoftmax_data(dy, y)`", :∇logsoftmax)
∇logsoftmax(Δ, x, logsoftmax(x; dims); dims)
end
∇logsoftmax!(Δ, x; dims = 1) = Δ .= ∇logsoftmax(Δ, x; dims)
∇logsoftmax!(out, Δ, x; dims = 1) = out .= ∇logsoftmax(Δ, x; dims)


### Deprecated while v0.8 was latest

export ∇softmax,
2 changes: 1 addition & 1 deletion src/dropout.jl
@@ -158,5 +158,5 @@ _rng_from_array(::AbstractArray) = Random.default_rng()
@non_differentiable _rng_from_array(::Any)

# This exists because `rand!(default_rng(), CUDA.rand(3))` ignores the RNG,
# and Flux would prefer an error. NNlibCUDA will overload it to produce that.
# and Flux would prefer an error. NNlibCUDAExt will overload it to produce that.
_rng_compat_array(::AbstractRNG, ::AbstractArray) = nothing
9 changes: 0 additions & 9 deletions src/upsample.jl
@@ -380,15 +380,6 @@ function ∇upsample_linear_kernel!(
return dx
end

# Compatibility layer for old versions of NNlibCUDA.
# TODO Can be removed from NNlib 0.9.
upsample_linear_wcn!(y, x) = upsample_linear_kernel!(y, x)
upsample_bilinear_whcn!(y, x) = upsample_linear_kernel!(y, x)
upsample_trilinear_whdcn!(y, x) = upsample_linear_kernel!(y, x)
∇upsample_linear_wcn!(y, x) = ∇upsample_linear_kernel!(y, x)
∇upsample_bilinear_whcn!(y, x) = ∇upsample_linear_kernel!(y, x)
∇upsample_trilinear_whdcn!(y, x) = ∇upsample_linear_kernel!(y, x)

# Linear (CPU): parallelization along channel x batch dimensions.

@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, align::Val{A}) where {
2 changes: 0 additions & 2 deletions test/Project.toml
@@ -1,13 +1,11 @@
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -14,8 +14,8 @@ end
end
end

# Broadcasting over complex CuArray works without NNlibCUDA, this test checks that
# NNlibCUDA does not cause such operations to take a fast path which does not support
# Broadcasting over complex CuArray works without NNlibCUDAExt, this test checks that
# NNlibCUDAExt does not cause such operations to take a fast path which does not support
# complex numbers (e.g. cuDNN)
@testset "complex" begin
f(x) = tanh.(x)
File renamed without changes.
File renamed without changes.
27 changes: 27 additions & 0 deletions test/ext_cuda/batchnorm.jl
@@ -0,0 +1,27 @@
@testset "Batchnorm" begin
    v = CUDA.rand(Float32, 2)
    m = CUDA.rand(Float32, 2, 5)

    @testset for training in (true, false), track_stats in (true, false)
        kws = (training=training, track_stats=track_stats)

        # Normal
        batchnorm(v, v, m, v, v, 1.0; kws...)
        ∇batchnorm(v, v, m, m, v, v, 1.0; kws...)

        # No affine
        batchnorm(nothing, nothing, m, v, v, 1.0; kws...)
        ∇batchnorm(nothing, nothing, m, m, v, v, 1.0; kws...)

        # No tracking
        batchnorm(v, v, m, nothing, nothing, 1.0; kws...)
        ∇batchnorm(v, v, m, m, nothing, nothing, 1.0; kws...)

        # Both or neither tracked or affine params must be set
        for (α, β) in ((v, nothing), (nothing, v))
            @test_throws MethodError batchnorm(α, β, m, v, v, 1.0; kws...)
            @test_throws MethodError ∇batchnorm(α, β, m, m, v, v, 1.0; kws...)
            @test_throws ArgumentError batchnorm(v, v, m, α, β, 1.0; kws...)
        end
    end
end
File renamed without changes.