From e1a4bbc8b5f898fafb361bb24f35bf0ee8f8cc7c Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:08:46 +0200
Subject: [PATCH 1/6] fix AdamW; improve WeightDecay docstring

---
 src/optimise/optimisers.jl | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index e67f7e49cd..3b418fad92 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -491,7 +491,7 @@ opt = ADAMW(0.001, (0.89, 0.995), 0.1)
 ```
 """
 ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
-  Optimiser(ADAM(η, β), WeightDecay(decay))
+  Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η))
 
 """
     AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999))
@@ -627,12 +627,18 @@ function apply!(o::ExpDecay, x, Δ)
 end
 
 """
-    WeightDecay(wd = 0)
+    WeightDecay(λ = 0)
 
-Decay weights by `wd`.
+Decay weights by ``λ``.
+Tipically composed with other optimizers as the first transformation to the gradient,
+making it equivalent to adding ``L_2`` regularization
+with coefficient ``λ`` to the loss.
 
-# Parameters
-- Weight decay (`wd`)
+# Examples
+
+```julia
+opt = Optimiser(WeightDecay(1f-4), ADAM())
+```
 """
 mutable struct WeightDecay <: AbstractOptimiser
   wd::Real

From 5f314ac255699def42c5b6ed300850a897a3f8ae Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:17:34 +0200
Subject: [PATCH 2/6] improve InvDecays docstring

---
 src/optimise/optimisers.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 3b418fad92..57ece31ffe 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -564,9 +564,15 @@ Apply inverse time decay to an optimiser, so that the effective step size at
 iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
 The wrapped optimiser's step size is not modified.
 
+See also the [Scheduling Optimisers](@ref) section of the docs
+for more general scheduling techniques.
+
 # Examples
+
 ```julia
-Optimiser(InvDecay(..), Opt(..))
+# Inverse decay of the learning rate
+# with starting value 0.001 and decay coefficient 0.01.
+opt = Optimiser(ADAM(1f-3), InvDecay(1f-2))
 ```
 """
 mutable struct InvDecay <: AbstractOptimiser

From 9f1966faf0d9fb28881b78ade63e1e2cad08075e Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:26:27 +0200
Subject: [PATCH 3/6] improve ExpDecays docstring

---
 src/optimise/optimisers.jl | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 57ece31ffe..cc3ddb0e4b 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -569,6 +569,9 @@ for more general scheduling techniques.
 
 # Examples
 
+`InvDecay` is tipically composed with other optimizers
+as the last transformation of the gradient:
+
 ```julia
 # Inverse decay of the learning rate
 # with starting value 0.001 and decay coefficient 0.01.
@@ -604,12 +607,16 @@ a minimum of `clip`.
   two decay operations.
 - `clip`: Minimum value of learning rate.
 
+
+See also the [Scheduling Optimisers](@ref) section of the docs
+for more general scheduling techniques.
+
 # Examples
 
-To apply exponential decay to an optimiser:
-```julia
-Optimiser(ExpDecay(..), Opt(..))
-opt = Optimiser(ExpDecay(), ADAM())
+`ExpDecay` is tipically composed with other optimizers
+as the last transformation of the gradient:
+```julia
+opt = Optimiser(ADAM(), ExpDecay())
 ```
 """
 mutable struct ExpDecay <: AbstractOptimiser
@@ -620,7 +627,8 @@ mutable struct ExpDecay <: AbstractOptimiser
   current::IdDict
 end
 
-ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) = ExpDecay(opt, decay, decay_step, clip, IdDict())
+ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) =
+  ExpDecay(opt, decay, decay_step, clip, IdDict())
 
 function apply!(o::ExpDecay, x, Δ)
   η, s, decay = o.eta, o.step, o.decay

From 564ab29cc081a248b9a8db532a679ebfc8739342 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:08 +0200
Subject: [PATCH 4/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index cc3ddb0e4b..e854e10537 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -644,7 +644,7 @@ end
     WeightDecay(λ = 0)
 
 Decay weights by ``λ``.
-Tipically composed with other optimizers as the first transformation to the gradient,
+Typically composed with other optimizers as the first transformation to the gradient,
 making it equivalent to adding ``L_2`` regularization
 with coefficient ``λ`` to the loss.
 

From b9c94f59cf9787c326a54d6f71994f70e5c34ff0 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:14 +0200
Subject: [PATCH 5/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index e854e10537..07fdd5a374 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -569,7 +569,7 @@ for more general scheduling techniques.
 
 # Examples
 
-`InvDecay` is tipically composed with other optimizers
+`InvDecay` is typically composed with other optimizers
 as the last transformation of the gradient:
 
 ```julia

From 380ca7606f57abbf64ecce8c789858b0d7f12d39 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:25 +0200
Subject: [PATCH 6/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 07fdd5a374..2fde36143a 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -613,7 +613,7 @@ for more general scheduling techniques.
 
 # Examples
 
-`ExpDecay` is tipically composed with other optimizers
+`ExpDecay` is typically composed with other optimizers
 as the last transformation of the gradient:
 ```julia
 opt = Optimiser(ADAM(), ExpDecay())
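Note on the `ADAMW` change in PATCH 1/6: with the old composition `Optimiser(ADAM(η, β), WeightDecay(decay))`, `ADAM` scales only the gradient part of the step by `η`, so the `decay * x` term that `WeightDecay` adds afterwards is never multiplied by the learning rate. Chaining `ADAM(1, β)`, `WeightDecay(decay)` and `Descent(η)` instead scales the whole step, decay included, by `η`, which is the decoupled weight decay of Loshchilov & Hutter's AdamW. A minimal sketch of the two compositions (assuming the Flux 0.12-era exports used in this patch; the hyperparameter values are hypothetical):

```julia
using Flux  # assumes Flux 0.12-era exports: ADAM, ADAMW, WeightDecay, Descent, Optimiser

η, β, λ = 0.001, (0.9, 0.999), 0.1  # hypothetical hyperparameters

# Old composition: ADAM already multiplies its output by η, so the
# λ .* x term added afterwards by WeightDecay is not scaled by η.
old_adamw = Optimiser(ADAM(η, β), WeightDecay(λ))

# Fixed composition: ADAM(1, β) preconditions the gradient with unit
# step size, WeightDecay then adds λ .* x, and Descent(η) scales the
# whole update, giving the decoupled decay term η * λ .* x.
new_adamw = Optimiser(ADAM(1, β), WeightDecay(λ), Descent(η))

# The patched constructor builds exactly this chain:
opt = ADAMW(η, β, λ)
```

In particular, halving `η` now also halves the effective per-step weight decay, so the decay stays coupled to the step size rather than acting as a fixed additive shrinkage.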