From ff58001e50e6949035712d9bc78c55086f14b8f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 08:31:54 +0000 Subject: [PATCH 1/3] Initial plan From 1d5efdeda732854036c41af99c3df4165b44fdb2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:11:16 +0000 Subject: [PATCH 2/3] Implement gelu_sigmoid and update gelu_tanh to use tanh Co-authored-by: CarloLucibello <7014210+CarloLucibello@users.noreply.github.com> --- src/activations.jl | 42 +++++++++++++++++++++++++++++++++++++----- test/activations.jl | 9 ++++++--- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/activations.jl b/src/activations.jl index ba364699..cf998ec8 100644 --- a/src/activations.jl +++ b/src/activations.jl @@ -5,7 +5,7 @@ ACTIVATIONS = [ :σ, :hardσ, :hardtanh, :relu, - :leakyrelu, :relu6, :rrelu, :elu, :gelu_tanh, :gelu_erf, :swish, :hardswish, :selu, + :leakyrelu, :relu6, :rrelu, :elu, :gelu_tanh, :gelu_sigmoid, :gelu_erf, :swish, :hardswish, :selu, :celu, :softplus, :softsign, :logσ, :logcosh, :mish, :tanhshrink, :softshrink, :trelu, :lisht, :tanh_fast, :sigmoid_fast, @@ -305,6 +305,10 @@ deriv_elu(Ω, α=1) = ifelse(Ω ≥ 0, one(Ω), Ω + oftype(Ω, α)) Activation function from ["Gaussian Error Linear Units"](https://arxiv.org/abs/1606.08415) using tanh approximation. +This implementation uses `tanh` which allows for better pattern matching and fusion in optimizing +compilers compared to the sigmoid-based implementation. For a potentially faster implementation +that uses `sigmoid_fast`, see [`gelu_sigmoid`](@ref). + ```julia-repl julia> lineplot(gelu_tanh, -2, 2, height=7) ┌────────────────────────────────────────┐ @@ -337,16 +341,43 @@ julia> lineplot!(ans, swish) """ function gelu_tanh(x) α = oftf(x, 0.044715) - # λ = oftf(x, gelu_λ) - # x/2 * (1 + tanh(λ * (x + α * x^3))) # Standard implementation, for reference - λλ = oftf(x, gelu_2λ) - x * sigmoid_fast(λλ * x * muladd(x^2, α, one(x))) # This is faster & more accurate + λ = oftf(x, gelu_λ) + x/2 * (1 + tanh_fast(λ * (x + α * x^3))) end const gelu_λ = √(2 / π) const gelu_2λ = √(8 / π) function deriv_gelu_tanh(x) + α = oftf(x, 0.044715) + α2 = oftf(x, 0.08943) + λ = oftf(x, gelu_λ) + x2 = x * x + t = muladd(x2, α, one(x)) + z = λ * x * t + Ω = tanh_fast(z) + sech2 = 1 - Ω^2 + (1 + Ω)/2 + x * λ * muladd(x2, α2, t) * sech2 / 2 +end + +""" + gelu_sigmoid(x) = x * σ(√(8/π) * (x + 0.044715x^3)) + +Alternative implementation of the GELU activation function using `sigmoid` instead of `tanh`. +This is mathematically equivalent to [`gelu_tanh`](@ref) but may be faster in some cases. + +The sigmoid-based implementation may prevent pattern matching and fusion in some optimizing +compilers. Use [`gelu_tanh`](@ref) if you need better compiler optimization support. + +See ["Gaussian Error Linear Units"](https://arxiv.org/abs/1606.08415). +""" +function gelu_sigmoid(x) + α = oftf(x, 0.044715) + λλ = oftf(x, gelu_2λ) + x * sigmoid_fast(λλ * x * muladd(x^2, α, one(x))) +end + +function deriv_gelu_sigmoid(x) α = oftf(x, 0.044715) α2 = oftf(x, 0.08943) λλ = oftf(x, gelu_2λ) @@ -896,6 +927,7 @@ UNARY_ACTS = [ # f, dfdx # rrelu is random, can't write a rule. (:elu, :(deriv_elu(Ω))), (:gelu_tanh, :(deriv_gelu_tanh(x))), + (:gelu_sigmoid, :(deriv_gelu_sigmoid(x))), (:gelu_erf, :(deriv_gelu_erf(x))), (:swish, :(Ω + sigmoid_fast(x) * (1 - Ω))), (:hardswish, :(deriv_hardswish(x))), diff --git a/test/activations.jl b/test/activations.jl index 0bb5047c..2b2a6b90 100644 --- a/test/activations.jl +++ b/test/activations.jl @@ -13,6 +13,7 @@ BINARY_ACTIVATIONS = filter(f -> hasmethod(f, Tuple{Float64, Float64}), ACTIVATI @test elu(0.0) == 0.0 @test gelu(0.0) == 0.0 @test gelu_tanh(0.0) == 0.0 +@test gelu_sigmoid(0.0) == 0.0 @test gelu_erf(0.0) == 0.0 @test swish(0.0) == 0.0 @test hardswish(0.0) == 0.0 @@ -37,8 +38,9 @@ BINARY_ACTIVATIONS = filter(f -> hasmethod(f, Tuple{Float64, Float64}), ACTIVATI @test relu6(1.0) == 1.0 @test rrelu(1.0) == 1.0 @test elu(1.0) == 1.0 -@test gelu(1.0) == 0.8411919906082768 -@test gelu_tanh(1.0) == 0.8411919906082768 +@test gelu(1.0) ≈ 0.8411919906082768 +@test gelu_tanh(1.0) ≈ 0.8411919906082768 +@test gelu_sigmoid(1.0) ≈ 0.8411919906082768 @test gelu_erf(1.0) == 0.8413447460685429 @test swish(1.0) == sigmoid(1.0) @test hardswish(1.0) == hardsigmoid(1.0) @@ -63,6 +65,7 @@ BINARY_ACTIVATIONS = filter(f -> hasmethod(f, Tuple{Float64, Float64}), ACTIVATI @test elu(-1.0) == exp(-1.0) - 1.0 @test gelu(-1.0) ≈ -0.15880800939172324 @test gelu_tanh(-1.0) ≈ -0.15880800939172324 +@test gelu_sigmoid(-1.0) ≈ -0.15880800939172324 @test gelu_erf(-1.0) == -0.15865525393145707 @test swish(-1.0) == -sigmoid(-1.0) @test hardswish(-1.0) == -hardsigmoid(-1.0) @@ -120,7 +123,7 @@ end a == softsign && continue @test !isnan(a(Inf32)) - a in [gelu, gelu_tanh, gelu_erf, swish, hardswish, logcosh, mish] && continue + a in [gelu, gelu_tanh, gelu_sigmoid, gelu_erf, swish, hardswish, logcosh, mish] && continue @test !isnan(a(-Inf32)) end end From d05cfc57dfbc3207610061b7ea7bcf2bf4d95348 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:14:30 +0000 Subject: [PATCH 3/3] Add gelu_sigmoid to documentation Co-authored-by: CarloLucibello <7014210+CarloLucibello@users.noreply.github.com> --- docs/src/reference.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/reference.md b/docs/src/reference.md index 1b1f7827..b0663564 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -11,6 +11,7 @@ celu elu gelu gelu_tanh +gelu_sigmoid gelu_erf hardsigmoid sigmoid_fast