This repository was archived by the owner on Jul 16, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 149
/
Copy pathgrad_desc.rs
476 lines (425 loc) · 14.1 KB
/
grad_desc.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
//! Gradient Descent
//!
//! Implementation of gradient descent algorithm. Module contains
//! the struct `GradientDesc` which is instantiated within models
//! implementing the Optimizable trait.
//!
//! Currently standard batch gradient descent is the only implemented
//! optimization algorithm but there is flexibility to introduce new
//! algorithms and git them into the same scheme easily.
use learning::optim::{Optimizable, OptimAlgorithm};
use linalg::Vector;
use linalg::{Matrix, BaseMatrix};
use rulinalg::utils;
use learning::toolkit::rand_utils;
const LEARNING_EPS: f64 = 1e-20;
/// Batch Gradient Descent algorithm
#[derive(Clone, Copy, Debug)]
pub struct GradientDesc {
/// The step-size for the gradient descent steps.
alpha: f64,
/// The number of iterations to run.
iters: usize,
}
/// The default gradient descent algorithm.
///
/// The defaults are:
///
/// - alpha = 0.3
/// - iters = 100
impl Default for GradientDesc {
fn default() -> GradientDesc {
GradientDesc {
alpha: 0.3,
iters: 100,
}
}
}
impl GradientDesc {
/// Construct a gradient descent algorithm.
///
/// Requires the step size and iteration count
/// to be specified.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::GradientDesc;
///
/// let gd = GradientDesc::new(0.3, 10000);
/// ```
pub fn new(alpha: f64, iters: usize) -> GradientDesc {
assert!(alpha > 0f64,
"The step size (alpha) must be greater than 0.");
GradientDesc {
alpha: alpha,
iters: iters,
}
}
}
impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Create the initial optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// Compute the cost and gradient for the current parameters
let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);
// Early stopping
if (start_iter_cost - cost).abs() < LEARNING_EPS {
break;
} else {
// Update the optimal parameters using gradient descent
optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
// Update the latest cost
start_iter_cost = cost;
}
}
optimizing_val.into_vec()
}
}
/// Stochastic Gradient Descent algorithm.
///
/// Uses basic momentum to control the learning rate.
#[derive(Clone, Copy, Debug)]
pub struct StochasticGD {
/// Controls the momentum of the descent
alpha: f64,
/// The square root of the raw learning rate.
mu: f64,
/// The number of passes through the data.
iters: usize,
}
/// The default Stochastic GD algorithm.
///
/// The defaults are:
///
/// - alpha = 0.1
/// - mu = 0.1
/// - iters = 20
impl Default for StochasticGD {
fn default() -> StochasticGD {
StochasticGD {
alpha: 0.1,
mu: 0.1,
iters: 20,
}
}
}
impl StochasticGD {
/// Construct a stochastic gradient descent algorithm.
///
/// Requires the learning rate, momentum rate and iteration count
/// to be specified.
///
/// With Nesterov momentum by default.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::StochasticGD;
///
/// let sgd = StochasticGD::new(0.1, 0.3, 5);
/// ```
pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
assert!(mu > 0f64, "The step size (mu) must be greater than 0.");
StochasticGD {
alpha: alpha,
mu: mu,
iters: iters,
}
}
}
impl<M> OptimAlgorithm<M> for StochasticGD
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
{
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Create the initial optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// Create the momentum based gradient distance
let mut delta_w = Vector::zeros(start.len());
// Set up the indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// The cost at the end of each stochastic gd pass
let mut end_cost = 0f64;
// Permute the indices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
// Compute the cost and gradient for this data pair
let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
// Backup previous velocity
let prev_w = delta_w.clone();
// Compute the difference in gradient using Nesterov momentum
delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
// Update the parameters
optimizing_val = &optimizing_val -
(&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha));
// Set the end cost (this is only used after the last iteration)
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
// Update the cost
start_iter_cost = end_cost;
}
}
optimizing_val.into_vec()
}
}
/// Adaptive Gradient Descent
///
/// The adaptive gradient descent algorithm (Duchi et al. 2010).
#[derive(Debug)]
pub struct AdaGrad {
alpha: f64,
tau: f64,
iters: usize,
}
impl AdaGrad {
/// Constructs a new AdaGrad algorithm.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::AdaGrad;
///
/// // Create a new AdaGrad algorithm with step size 0.5
/// // and adaptive scaling constant 1.0
/// let gd = AdaGrad::new(0.5, 1.0, 100);
/// ```
pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
assert!(alpha > 0f64,
"The step size (alpha) must be greater than 0.");
assert!(tau >= 0f64,
"The adaptive constant (tau) cannot be negative.");
AdaGrad {
alpha: alpha,
tau: tau,
iters: iters,
}
}
}
impl Default for AdaGrad {
fn default() -> AdaGrad {
AdaGrad {
alpha: 1f64,
tau: 3f64,
iters: 100,
}
}
}
impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Initialize the adaptive scaling
let mut ada_s = Vector::zeros(start.len());
// Initialize the optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// Set up the indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// The cost at the end of each stochastic gd pass
let mut end_cost = 0f64;
// Permute the indices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
// Compute the cost and gradient for this data pair
let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
// Update the adaptive scaling by adding the gradient squared
utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);
// Compute the change in gradient
utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
*x = self.alpha * (*x / (self.tau + (y).sqrt()))
});
// Update the parameters
optimizing_val = &optimizing_val - Vector::new(vec_data);
// Set the end cost (this is only used after the last iteration)
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
// Update the cost
start_iter_cost = end_cost;
}
}
optimizing_val.into_vec()
}
}
/// RMSProp
///
/// The RMSProp algorithm (Hinton et al. 2012).
#[derive(Debug, Clone, Copy)]
pub struct RMSProp {
/// The base step size of gradient descent steps
learning_rate: f64,
/// Rate at which running total of average square gradients decays
decay_rate: f64,
/// Small value used to avoid divide by zero
epsilon: f64,
/// The number of passes through the data
iters: usize,
}
/// The default RMSProp configuration
///
/// The defaults are:
///
/// - learning_rate = 0.01
/// - decay_rate = 0.9
/// - epsilon = 1.0e-5
/// - iters = 50
impl Default for RMSProp {
fn default() -> RMSProp {
RMSProp {
learning_rate: 0.01,
decay_rate: 0.9,
epsilon: 1.0e-5,
iters: 50
}
}
}
impl RMSProp {
/// Construct an RMSProp algorithm.
///
/// Requires learning rate, decay rate, epsilon, and iteration count.
///
/// #Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::RMSProp;
///
/// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
/// ```
pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
assert!(0f64 < learning_rate, "The learning rate must be positive");
assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
assert!(0f64 < epsilon, "Epsilon must be positive");
RMSProp {
decay_rate: decay_rate,
learning_rate: learning_rate,
epsilon: epsilon,
iters: iters
}
}
}
impl<M> OptimAlgorithm<M> for RMSProp
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Initial parameters
let mut params = Vector::new(start.to_vec());
// Running average of squared gradients
let mut rmsprop_cache = Vector::zeros(start.len());
// Set up indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost from the previous iteration
let mut prev_cost = 0f64;
for _ in 0..self.iters {
// The cost at end of each pass
let mut end_cost = 0f64;
// Permute the vertices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
let (cost, grad) = model.compute_grad(params.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
let mut grad = Vector::new(grad);
let grad_squared = grad.clone().apply(&|x| x*x);
// Update cached average of squared gradients
rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
// RMSProp update rule
utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
*x = *x * self.learning_rate / (y + self.epsilon).sqrt();
});
params = ¶ms - &grad;
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (prev_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
prev_cost = end_cost;
}
}
params.into_vec()
}
}
#[cfg(test)]
mod tests {
use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
#[test]
#[should_panic]
fn gd_neg_stepsize() {
let _ = GradientDesc::new(-0.5, 0);
}
#[test]
#[should_panic]
fn stochastic_gd_neg_momentum() {
let _ = StochasticGD::new(-0.5, 1f64, 0);
}
#[test]
#[should_panic]
fn stochastic_gd_neg_stepsize() {
let _ = StochasticGD::new(0.5, -1f64, 0);
}
#[test]
#[should_panic]
fn adagrad_neg_stepsize() {
let _ = AdaGrad::new(-0.5, 1f64, 0);
}
#[test]
#[should_panic]
fn adagrad_neg_adaptive_scale() {
let _ = AdaGrad::new(0.5, -1f64, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_decay_rate() {
let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_epsilon() {
let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_learning_rate() {
let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
}
}