-
Notifications
You must be signed in to change notification settings - Fork 26
/
lmcma.py
266 lines (244 loc) · 14.5 KB
/
lmcma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import numpy as np # engine for numerical computing
from pypop7.optimizers.es.es import ES # abstract class of all Evolution Strategies (ES) classes
class LMCMA(ES):
    """Limited-Memory Covariance Matrix Adaptation (LMCMA).

    .. note:: Currently `LMCMA` is one of **State-Of-The-Art (SOTA)** variants of `CMA-ES` designed especially for
       large-scale black-box optimization. Inspired by a well-established gradient-based optimizer called `L-BFGS
       <https://link.springer.com/article/10.1007/BF01589116>`_, it stores only *m* direction vectors to reconstruct
       the covariance matrix on-the-fly, resulting in **O(mn)** time complexity w.r.t. each sampling, where often
       *m=O(log(n))* and *n* is the dimensionality of objective function.

    Parameters
    ----------
    problem : `dict`
        problem arguments with the following common settings (`keys`):
            * 'fitness_function' - objective function to be **minimized** (`func`),
            * 'ndim_problem'     - number of dimensionality (`int`),
            * 'upper_boundary'   - upper boundary of search range (`array_like`),
            * 'lower_boundary'   - lower boundary of search range (`array_like`).
    options : `dict`
        optimizer options with the following common settings (`keys`):
            * 'max_function_evaluations' - maximum of function evaluations (`int`, default: `np.inf`),
            * 'max_runtime'              - maximal runtime to be allowed (`float`, default: `np.inf`),
            * 'seed_rng'                 - seed for random number generation needed to be *explicitly* set (`int`);
        and with the following particular settings (`keys`):
            * 'sigma' - initial global step-size, aka mutation strength (`float`),
            * 'mean'  - initial (starting) point, aka mean of Gaussian search distribution (`array_like`),

                * if not given, it will draw a random sample from the uniform distribution whose search range is
                  bounded by `problem['lower_boundary']` and `problem['upper_boundary']`.

            * 'm'      - number of direction vectors (`int`, default:
              `4 + int(3*np.log(problem['ndim_problem']))`),
            * 'base_m' - base number of direction vectors (`int`, default: `4`),
            * 'period' - update period (`int`, default:
              `int(np.maximum(1, np.log(problem['ndim_problem'])))`),
            * 'n_steps' - target number of generations between vectors (`int`, default:
              `problem['ndim_problem']`),
            * 'c_c' - learning rate for evolution path update (`float`, default:
              `0.5/np.sqrt(problem['ndim_problem'])`),
            * 'c_1' - learning rate for covariance matrix adaptation (`float`, default:
              `1.0/(10.0*np.log(problem['ndim_problem'] + 1.0))`),
            * 'c_s'    - learning rate for population success rule (`float`, default: `0.3`),
            * 'd_s'    - changing rate for population success rule (`float`, default: `1.0`),
            * 'z_star' - target success rate for population success rule (`float`, default: `0.3`),
            * 'n_individuals' - number of offspring, aka offspring population size (`int`, default:
              `4 + int(3*np.log(problem['ndim_problem']))`),
            * 'n_parents' - number of parents, aka parental population size (`int`, default:
              `int(options['n_individuals']/2)`).

    Examples
    --------
    Use the black-box optimizer `LMCMA` to minimize the well-known test function
    `Rosenbrock <http://en.wikipedia.org/wiki/Rosenbrock_function>`_:

    .. code-block:: python
       :linenos:

       >>> import numpy  # engine for numerical computing
       >>> from pypop7.benchmarks.base_functions import rosenbrock  # function to be minimized
       >>> from pypop7.optimizers.es.lmcma import LMCMA
       >>> problem = {'fitness_function': rosenbrock,  # to define problem arguments
       ...            'ndim_problem': 1000,
       ...            'lower_boundary': -5.0*numpy.ones((1000,)),
       ...            'upper_boundary': 5.0*numpy.ones((1000,))}
       >>> options = {'max_function_evaluations': 1e5*1000,  # to set optimizer options
       ...            'fitness_threshold': 1e-8,
       ...            'seed_rng': 2022,
       ...            'mean': 3.0*numpy.ones((1000,)),
       ...            'sigma': 3.0}  # global step-size may need to be tuned for optimality
       >>> lmcma = LMCMA(problem, options)  # to initialize the optimizer class
       >>> results = lmcma.optimize()  # to run the optimization/evolution process
       >>> print(f"LMCMA: {results['n_function_evaluations']}, {results['best_so_far_y']}")
       LMCMA: 2186953, 9.9927e-09

    For its correctness checking of Python coding, please refer to `this code-based repeatability report
    <https://github.com/Evolutionary-Intelligence/pypop/blob/main/pypop7/optimizers/es/_repeat_lmcma.py>`_
    for all details. For *pytest*-based automatic testing, please see `test_lmcma.py
    <https://github.com/Evolutionary-Intelligence/pypop/blob/main/pypop7/optimizers/es/test_lmcma.py>`_.

    Attributes
    ----------
    base_m : `int`
        base number of direction vectors.
    c_c : `float`
        learning rate for evolution path update.
    c_s : `float`
        learning rate for population success rule.
    c_1 : `float`
        learning rate for covariance matrix adaptation.
    d_s : `float`
        changing rate for population success rule.
    m : `int`
        number of direction vectors.
    mean : `array_like`
        initial (starting) point, aka mean of Gaussian search distribution.
    n_individuals : `int`
        number of offspring, aka offspring population size.
    n_parents : `int`
        number of parents, aka parental population size.
    n_steps : `int`
        target number of generations between vectors.
    period : `int`
        update period.
    sigma : `float`
        final global step-size, aka mutation strength.
    z_star : `float`
        target success rate for population success rule.

    References
    ----------
    Loshchilov, I., 2017.
    `LM-CMA: An alternative to L-BFGS for large-scale black box optimization.
    <https://direct.mit.edu/evco/article-abstract/25/1/143/1041/LM-CMA-An-Alternative-to-L-BFGS-for-Large-Scale>`_
    Evolutionary Computation, 25(1), pp.143-171.

    Please refer to the *official* C++ version from Loshchilov, which also provides an interface for Matlab:
    https://sites.google.com/site/ecjlmcma/
    (Unfortunately, this online link appears to be not openly available now.)
    """
    def __init__(self, problem, options):
        """Initialize hyper-parameters from `options` (falling back to the paper's defaults)
        and pre-compute constants reused at every sampling/update step.
        """
        ES.__init__(self, problem, options)
        self.m = options.get('m', 4 + int(3*np.log(self.ndim_problem)))  # number of direction vectors
        self.base_m = options.get('base_m', 4)  # base number of direction vectors
        self.period = options.get('period', int(np.maximum(1, np.log(self.ndim_problem))))  # update period
        self.n_steps = options.get('n_steps', self.ndim_problem)  # target number of generations between vectors
        self.c_c = options.get('c_c', 0.5/np.sqrt(self.ndim_problem))  # learning rate for evolution path
        self.c_1 = options.get('c_1', 1.0/(10.0*np.log(self.ndim_problem + 1.0)))  # learning rate for CMA
        self.c_s = options.get('c_s', 0.3)  # learning rate for population success rule (PSR)
        self.d_s = options.get('d_s', 1.0)  # changing rate for PSR
        self.z_star = options.get('z_star', 0.3)  # target success rate for PSR
        # constants of the incremental Cholesky-factor reconstruction (depend only on c_1)
        self._a = np.sqrt(1.0 - self.c_1)  # scaling factor used in Az() (see `_a_z`)
        self._c = 1.0/np.sqrt(1.0 - self.c_1)  # scaling factor used in Ainvz() (see `_a_inv_z`)
        self._bd_1 = np.sqrt(1.0 - self.c_1)  # NOTE(review): currently unused in this class — confirm intent
        self._bd_2 = self.c_1/(1.0 - self.c_1)  # used to compute coefficients b and d in `update_distribution`
        self._p_c_1 = 1.0 - self.c_c  # decay factor of evolution path p_c
        self._p_c_2 = None  # normalization factor of p_c update (needs `_mu_eff`, set in `initialize`)
        self._j = None  # indices of stored direction vectors, ordered from oldest to most recent
        self._l = None  # generation number at which each stored vector was saved
        self._it = None  # number of direction vectors currently stored (<= m)
        self._rr = None  # precomputed rank weights, for PSR
    def initialize(self, is_restart=False):
        """(Re-)allocate all per-run state.

        Returns the tuple `(mean, x, p_c, s, vm, pm, b, d, y)` threaded through
        `iterate`/`update_distribution` in `optimize`.
        """
        mean = self._initialize_mean(is_restart)  # mean of Gaussian search distribution
        x = np.empty((self.n_individuals, self.ndim_problem))  # offspring population
        p_c = np.zeros((self.ndim_problem,))  # evolution path
        s = 0.0  # for PSR
        vm = np.empty((self.m, self.ndim_problem))  # direction vectors used to reconstruct the Cholesky factor
        pm = np.empty((self.m, self.ndim_problem))  # stored evolution paths (one row written per update period)
        b = np.empty((self.m,))  # per-vector coefficients for Az() reconstruction
        d = np.empty((self.m,))  # per-vector coefficients for Ainvz() reconstruction
        y = np.empty((self.n_individuals,))  # fitness (no evaluation)
        self._p_c_2 = np.sqrt(self.c_c*(2.0 - self.c_c)*self._mu_eff)
        self._j = [None]*self.m
        self._l = [None]*self.m
        self._it = 0
        self._rr = np.arange(self.n_individuals*2, 0, -1) - 1  # ranks 2*lambda-1 ... 0, for PSR
        return mean, x, p_c, s, vm, pm, b, d, y
    def _rademacher(self):
        """Sampling from Rademacher distribution (each coordinate is +1 or -1 with equal probability)."""
        random = self.rng_optimization.integers(2, size=(self.ndim_problem,))
        random[random == 0] = -1  # map {0, 1} -> {-1, +1}
        return np.double(random)
    def _a_z(self, z=None, pm=None, vm=None, b=None, start=None, it=None):
        """Az(): Cholesky factor-vector update.

        Applies the stored direction vectors `start..it-1` (via the index list `self._j`)
        to transform the isotropic sample `z` into a sample from the adapted distribution,
        in O(m*n) time without forming the covariance matrix.
        """
        x = np.copy(z)
        for t in range(start, it):
            x = self._a*x + b[self._j[t]]*np.dot(vm[self._j[t]], z)*pm[self._j[t]]
        return x
    def iterate(self, mean=None, x=None, pm=None, vm=None, y=None, b=None, args=None):
        """Sample and evaluate the offspring population (with mirrored sampling).

        Each *pair* of consecutive offspring shares one reconstructed direction `a_z`:
        the first uses `mean + sigma*a_z`, the second `mean - sigma*a_z`.
        """
        sign, a_z = 1, np.empty((self.ndim_problem,))  # for mirrored sampling
        for k in range(self.n_individuals):
            if self._check_terminations():
                return x, y  # early exit: remaining entries of x/y keep their previous values
            if sign == 1:  # SelectSubst(): direction vectors selection
                # randomize how many of the most recent vectors take part in the reconstruction;
                # the first offspring (k == 0) uses a 10x larger base to sample a broader direction
                base_m = (10.0*self.base_m if k == 0 else self.base_m)*np.abs(
                    self.rng_optimization.standard_normal())
                base_m = float(self._it if base_m > self._it else base_m)  # clip to number of stored vectors
                a_z = self._a_z(self._rademacher(), pm, vm, b,
                                int(self._it - base_m) if self._it > 1 else 0, self._it)
            x[k] = mean + sign*self.sigma*a_z
            y[k] = self._evaluate_fitness(x[k], args)
            sign *= -1  # sampling in the opposite direction for mirrored sampling
        return x, y
    def _a_inv_z(self, v=None, vm=None, d=None, i=None):
        """Ainvz(): inverse Cholesky factor-vector update.

        Applies the inverses of the first `i` stored transforms to `v`, producing the
        direction vector in the isotropic coordinate system.
        """
        x = np.copy(v)
        for t in range(0, i):
            x = self._c*x - d[self._j[t]]*np.dot(vm[self._j[t]], x)*vm[self._j[t]]
        return x
    def update_distribution(self, mean=None, x=None, p_c=None, s=None, vm=None,
                            pm=None, b=None, d=None, y=None, y_bak=None):
        """Update mean, evolution path, stored direction vectors and global step-size.

        `y_bak` is the previous generation's fitness, needed by the population success
        rule (PSR) to compare the two generations' rankings.
        """
        # recombination: weighted mean of the n_parents best offspring (weights `_w` from ES base class)
        mean_bak = np.dot(self._w, x[np.argsort(y)[:self.n_parents]])
        p_c = self._p_c_1*p_c + self._p_c_2*(mean_bak - mean)/self.sigma
        # select and store direction vectors - to preserve a certain temporal distance in terms of
        # number of generations between the stored direction vectors
        if self._n_generations % self.period == 0:  # only update every `period` generations
            _n_generations = int(self._n_generations/self.period)  # update counter (not raw generation count)
            i_min = 1  # index of the first vector that will be replaced by the new one
            # the higher is the index of `self._j`, the more recent is the corresponding direction vector
            if _n_generations < self.m:
                self._j[_n_generations] = _n_generations  # memory not full yet: append in place
            else:
                if self.m > 1:
                    # find a pair of consecutively saved vectors with the distance between them
                    # closest to a target distance (`n_steps` generations)
                    d_min = (self._l[self._j[1]] - self._l[self._j[0]]) - self.n_steps
                    for j in range(2, self.m):
                        d_cur = (self._l[self._j[j]] - self._l[self._j[j - 1]]) - self.n_steps
                        if d_cur < d_min:
                            d_min, i_min = d_cur, j
                    # if all gaps already meet the target distance, drop the oldest vector instead
                    i_min = 0 if d_min >= 0 else i_min
                # rotate the index list so the replaced slot becomes the most recent one
                updated = self._j[i_min]
                for j in range(i_min, self.m - 1):
                    self._j[j] = self._j[j + 1]
                self._j[self.m - 1] = updated
            self._it = int(np.minimum(self.m, _n_generations + 1))  # number of vectors now stored
            self._l[self._j[self._it - 1]] = _n_generations*self.period  # record save-time of newest vector
            pm[self._j[self._it - 1]] = p_c  # store the current evolution path as the newest vector
            # recompute vm/b/d for the newest vector and every vector saved after the replaced slot
            # (their reconstruction depends on the vectors preceding them)
            for i in range(0 if i_min == 1 else i_min, self._it):
                vm[self._j[i]] = self._a_inv_z(pm[self._j[i]], vm, d, i)
                v_n = np.dot(vm[self._j[i]], vm[self._j[i]])  # squared norm of the new direction vector
                bd_3 = np.sqrt(1.0 + self._bd_2*v_n)
                b[self._j[i]] = self._a/v_n*(bd_3 - 1.0)
                d[self._j[i]] = self._c/v_n*(1.0 - 1.0/bd_3)
        if self._n_generations > 0:  # PSR needs a previous generation to compare against
            # rank the combined old+new fitness; z_psr > 0 means the new generation dominates
            r = np.argsort(np.hstack((y, y_bak)))
            z_psr = np.sum(self._rr[r < self.n_individuals] - self._rr[r >= self.n_individuals])
            z_psr = z_psr/np.power(self.n_individuals, 2) - self.z_star
            s = (1.0 - self.c_s)*s + self.c_s*z_psr  # smoothed success indicator
            self.sigma *= np.exp(s/self.d_s)  # increase step-size on success, decrease on failure
        return mean_bak, p_c, s, vm, pm, b, d
    def restart_reinitialize(self, mean=None, x=None, p_c=None, s=None, vm=None,
                             pm=None, b=None, d=None, y=None):
        """Re-initialize all run state when the base class signals a restart; otherwise pass state through."""
        if self.is_restart and ES.restart_reinitialize(self, y):
            mean, x, p_c, s, vm, pm, b, d, y = self.initialize(True)
        return mean, x, p_c, s, vm, pm, b, d, y
    def optimize(self, fitness_function=None, args=None):  # for all generations (iterations)
        """Run the whole optimization/evolution process until a termination condition is met.

        Returns the results dictionary from the base class `_collect`, extended with the
        final evolution path `p_c` and PSR state `s`.
        """
        fitness = ES.optimize(self, fitness_function)
        mean, x, p_c, s, vm, pm, b, d, y = self.initialize()
        while not self.termination_signal:
            y_bak = np.copy(y)  # keep last generation's fitness for the PSR comparison
            # sample and evaluate offspring population
            x, y = self.iterate(mean, x, pm, vm, y, b, args)
            if self._check_terminations():
                break
            mean, p_c, s, vm, pm, b, d = self.update_distribution(
                mean, x, p_c, s, vm, pm, b, d, y, y_bak)
            self._print_verbose_info(fitness, y)
            self._n_generations += 1
            mean, x, p_c, s, vm, pm, b, d, y = self.restart_reinitialize(
                mean, x, p_c, s, vm, pm, b, d, y)
        results = self._collect(fitness, y, mean)
        results['p_c'] = p_c
        results['s'] = s
        return results