From e1a4bbc8b5f898fafb361bb24f35bf0ee8f8cc7c Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:08:46 +0200
Subject: [PATCH 1/6] fix AdamW; improve WeightDecay docstring

---
 src/optimise/optimisers.jl | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index e67f7e49cd..3b418fad92 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -491,7 +491,7 @@ opt = ADAMW(0.001, (0.89, 0.995), 0.1)
 ```
 """
 ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
-  Optimiser(ADAM(η, β), WeightDecay(decay))
+  Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η))
 
 """
     AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999))
@@ -627,12 +627,18 @@ function apply!(o::ExpDecay, x, Δ)
 end
 
 """
-    WeightDecay(wd = 0)
+    WeightDecay(λ = 0)
 
-Decay weights by `wd`.
+Decay weights by ``λ``.
+Tipically composed with other optimizers as the first transformation to the gradient,
+making it equivalent to adding ``L_2`` regularization
+with coefficient ``λ`` to the loss.
 
-# Parameters
-- Weight decay (`wd`)
+# Examples
+
+```julia
+opt = Optimiser(WeightDecay(1f-4), ADAM())
+```
 """
 mutable struct WeightDecay <: AbstractOptimiser
   wd::Real

From 5f314ac255699def42c5b6ed300850a897a3f8ae Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:17:34 +0200
Subject: [PATCH 2/6] improve InvDecays docstring

---
 src/optimise/optimisers.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 3b418fad92..57ece31ffe 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -564,9 +564,15 @@ Apply inverse time decay to an optimiser, so that the effective step size at
 iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
 The wrapped optimiser's step size is not modified.
 
+See also the [Scheduling Optimisers](@ref) section of the docs
+for more general scheduling techniques.
+
 # Examples
+
 ```julia
-Optimiser(InvDecay(..), Opt(..))
+# Inverse decay of the learning rate
+# with starting value 0.001 and decay coefficient 0.01.
+opt = Optimiser(ADAM(1f-3), InvDecay(1f-2))
 ```
 """
 mutable struct InvDecay <: AbstractOptimiser

From 9f1966faf0d9fb28881b78ade63e1e2cad08075e Mon Sep 17 00:00:00 2001
From: CarloLucibello
Date: Thu, 10 Jun 2021 08:26:27 +0200
Subject: [PATCH 3/6] improve ExpDecays docstring

---
 src/optimise/optimisers.jl | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 57ece31ffe..cc3ddb0e4b 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -569,6 +569,9 @@ for more general scheduling techniques.
 
 # Examples
 
+`InvDecay` is tipically composed with other optimizers
+as the last transformation of the gradient:
+
 ```julia
 # Inverse decay of the learning rate
 # with starting value 0.001 and decay coefficient 0.01.
@@ -604,12 +607,16 @@ a minimum of `clip`.
   two decay operations.
 - `clip`: Minimum value of learning rate.
 
+
+See also the [Scheduling Optimisers](@ref) section of the docs
+for more general scheduling techniques.
+
 # Examples
 
-To apply exponential decay to an optimiser:
-```julia
-Optimiser(ExpDecay(..), Opt(..))
-opt = Optimiser(ExpDecay(), ADAM())
+`ExpDecay` is tipically composed with other optimizers
+as the last transformation of the gradient:
+```julia
+opt = Optimiser(ADAM(), ExpDecay())
 ```
 """
 mutable struct ExpDecay <: AbstractOptimiser
@@ -620,7 +627,8 @@ mutable struct ExpDecay <: AbstractOptimiser
   current::IdDict
 end
 
-ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) = ExpDecay(opt, decay, decay_step, clip, IdDict())
+ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4) =
+  ExpDecay(opt, decay, decay_step, clip, IdDict())
 
 function apply!(o::ExpDecay, x, Δ)
   η, s, decay = o.eta, o.step, o.decay

From 564ab29cc081a248b9a8db532a679ebfc8739342 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:08 +0200
Subject: [PATCH 4/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index cc3ddb0e4b..e854e10537 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -644,7 +644,7 @@ end
     WeightDecay(λ = 0)
 
 Decay weights by ``λ``.
-Tipically composed with other optimizers as the first transformation to the gradient,
+Typically composed with other optimizers as the first transformation to the gradient,
 making it equivalent to adding ``L_2`` regularization
 with coefficient ``λ`` to the loss.
 

From b9c94f59cf9787c326a54d6f71994f70e5c34ff0 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:14 +0200
Subject: [PATCH 5/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index e854e10537..07fdd5a374 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -569,7 +569,7 @@ for more general scheduling techniques.
 
 # Examples
 
-`InvDecay` is tipically composed with other optimizers
+`InvDecay` is typically composed with other optimizers
 as the last transformation of the gradient:
 
 ```julia

From 380ca7606f57abbf64ecce8c789858b0d7f12d39 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 10 Jun 2021 19:30:25 +0200
Subject: [PATCH 6/6] Update src/optimise/optimisers.jl

Co-authored-by: Dhairya Gandhi
---
 src/optimise/optimisers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index 07fdd5a374..2fde36143a 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -613,7 +613,7 @@ for more general scheduling techniques.
 
 # Examples
 
-`ExpDecay` is tipically composed with other optimizers
+`ExpDecay` is typically composed with other optimizers
 as the last transformation of the gradient:
 ```julia
 opt = Optimiser(ADAM(), ExpDecay())
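Note on the `ADAMW` change in PATCH 1/6: with the old composition `Optimiser(ADAM(η, β), WeightDecay(decay))`, `ADAM` scales only the gradient part of the step by `η`, so the `decay * x` term that `WeightDecay` adds afterwards is never multiplied by the learning rate. Chaining `ADAM(1, β)`, `WeightDecay(decay)` and `Descent(η)` instead scales the whole step, decay included, by `η`, which is the decoupled weight decay of Loshchilov & Hutter's AdamW. A minimal sketch of the two compositions (assuming the Flux 0.12-era exports used in this patch; the hyperparameter values are hypothetical):

```julia
using Flux  # assumes Flux 0.12-era exports: ADAM, ADAMW, WeightDecay, Descent, Optimiser

η, β, λ = 0.001, (0.9, 0.999), 0.1  # hypothetical hyperparameters

# Old composition: ADAM already multiplies its output by η, so the
# λ .* x term added afterwards by WeightDecay is not scaled by η.
old_adamw = Optimiser(ADAM(η, β), WeightDecay(λ))

# Fixed composition: ADAM(1, β) preconditions the gradient with unit
# step size, WeightDecay then adds λ .* x, and Descent(η) scales the
# whole update, giving the decoupled decay term η * λ .* x.
new_adamw = Optimiser(ADAM(1, β), WeightDecay(λ), Descent(η))

# The patched constructor builds exactly this chain:
opt = ADAMW(η, β, λ)
```

In particular, halving `η` now also halves the effective per-step weight decay, so the decay stays coupled to the step size rather than acting as a fixed additive shrinkage.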