This repository has been archived by the owner on Jul 16, 2021. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
rusty-machine/src/learning/optim/grad_desc.rs
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
476 lines (425 sloc)
14.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//! Gradient Descent
//!
//! Implementations of gradient descent algorithms. The module contains
//! the struct `GradientDesc`, which is instantiated within models
//! implementing the `Optimizable` trait.
//!
//! Besides standard batch gradient descent, the module also provides the
//! `StochasticGD`, `AdaGrad` and `RMSProp` algorithms, and there is
//! flexibility to introduce new algorithms and fit them into the same
//! scheme easily.
use learning::optim::{Optimizable, OptimAlgorithm}; | |
use linalg::Vector; | |
use linalg::{Matrix, BaseMatrix}; | |
use rulinalg::utils; | |
use learning::toolkit::rand_utils; | |
/// Early-stopping tolerance.
///
/// The optimizers in this file leave their main loop once the absolute
/// difference between the current cost and the previously recorded cost is
/// smaller than this value.
const LEARNING_EPS: f64 = 1e-20;
/// Batch gradient descent optimizer.
///
/// Stores the two settings used when optimizing: a fixed step size and an
/// upper bound on the number of iterations.
#[derive(Clone, Copy, Debug)]
pub struct GradientDesc {
    /// Step size: the gradient is scaled by this factor before it is
    /// subtracted from the parameters.
    alpha: f64,
    /// Maximum number of iterations to run.
    iters: usize,
}

/// The default batch gradient descent configuration.
///
/// Default values:
///
/// - alpha = 0.3
/// - iters = 100
impl Default for GradientDesc {
    fn default() -> Self {
        GradientDesc {
            iters: 100,
            alpha: 0.3,
        }
    }
}

impl GradientDesc {
    /// Builds a gradient descent algorithm from an explicit step size
    /// (`alpha`) and iteration count (`iters`).
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::optim::grad_desc::GradientDesc;
    ///
    /// let gd = GradientDesc::new(0.3, 10000);
    /// ```
    ///
    /// # Panics
    ///
    /// Panics unless `alpha` is strictly greater than zero. A NaN `alpha`
    /// fails the comparison and therefore panics as well.
    pub fn new(alpha: f64, iters: usize) -> GradientDesc {
        assert!(0f64 < alpha,
                "The step size (alpha) must be greater than 0.");

        GradientDesc {
            iters: iters,
            alpha: alpha,
        }
    }
}
impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc { | |
fn optimize(&self, | |
model: &M, | |
start: &[f64], | |
inputs: &M::Inputs, | |
targets: &M::Targets) | |
-> Vec<f64> { | |
// Create the initial optimal parameters | |
let mut optimizing_val = Vector::new(start.to_vec()); | |
// The cost at the start of each iteration | |
let mut start_iter_cost = 0f64; | |
for _ in 0..self.iters { | |
// Compute the cost and gradient for the current parameters | |
let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets); | |
// Early stopping | |
if (start_iter_cost - cost).abs() < LEARNING_EPS { | |
break; | |
} else { | |
// Update the optimal parameters using gradient descent | |
optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha; | |
// Update the latest cost | |
start_iter_cost = cost; | |
} | |
} | |
optimizing_val.into_vec() | |
} | |
} | |
/// Stochastic gradient descent optimizer with momentum.
///
/// Parameters are updated one training row at a time; a momentum term
/// carries part of the previous update into the next one.
#[derive(Clone, Copy, Debug)]
pub struct StochasticGD {
    /// Momentum coefficient: the weight given to the previous update when
    /// the next one is formed.
    alpha: f64,
    /// Step size: the factor applied to each freshly computed gradient.
    mu: f64,
    /// The number of passes through the data.
    iters: usize,
}

/// The default stochastic gradient descent configuration.
///
/// Default values:
///
/// - alpha = 0.1
/// - mu = 0.1
/// - iters = 20
impl Default for StochasticGD {
    fn default() -> Self {
        StochasticGD {
            iters: 20,
            mu: 0.1,
            alpha: 0.1,
        }
    }
}

impl StochasticGD {
    /// Builds a stochastic gradient descent algorithm.
    ///
    /// The arguments are, in order: the momentum coefficient (`alpha`), the
    /// step size (`mu`) and the number of passes through the data (`iters`).
    ///
    /// The update applied during optimization uses Nesterov momentum.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
    ///
    /// let sgd = StochasticGD::new(0.1, 0.3, 5);
    /// ```
    ///
    /// # Panics
    ///
    /// Panics unless both `alpha` and `mu` are strictly greater than zero
    /// (NaN values fail the comparison and panic too).
    pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
        assert!(0f64 < alpha, "The momentum (alpha) must be greater than 0.");
        assert!(0f64 < mu, "The step size (mu) must be greater than 0.");

        StochasticGD {
            iters: iters,
            mu: mu,
            alpha: alpha,
        }
    }
}
impl<M> OptimAlgorithm<M> for StochasticGD | |
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> | |
{ | |
fn optimize(&self, | |
model: &M, | |
start: &[f64], | |
inputs: &M::Inputs, | |
targets: &M::Targets) | |
-> Vec<f64> { | |
// Create the initial optimal parameters | |
let mut optimizing_val = Vector::new(start.to_vec()); | |
// Create the momentum based gradient distance | |
let mut delta_w = Vector::zeros(start.len()); | |
// Set up the indices for permutation | |
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>(); | |
// The cost at the start of each iteration | |
let mut start_iter_cost = 0f64; | |
for _ in 0..self.iters { | |
// The cost at the end of each stochastic gd pass | |
let mut end_cost = 0f64; | |
// Permute the indices | |
rand_utils::in_place_fisher_yates(&mut permutation); | |
for i in &permutation { | |
// Compute the cost and gradient for this data pair | |
let (cost, vec_data) = model.compute_grad(optimizing_val.data(), | |
&inputs.select_rows(&[*i]), | |
&targets.select_rows(&[*i])); | |
// Backup previous velocity | |
let prev_w = delta_w.clone(); | |
// Compute the difference in gradient using Nesterov momentum | |
delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha; | |
// Update the parameters | |
optimizing_val = &optimizing_val - | |
(&prev_w * (-self.alpha) + &delta_w * (1. + self.alpha)); | |
// Set the end cost (this is only used after the last iteration) | |
end_cost += cost; | |
} | |
end_cost /= inputs.rows() as f64; | |
// Early stopping | |
if (start_iter_cost - end_cost).abs() < LEARNING_EPS { | |
break; | |
} else { | |
// Update the cost | |
start_iter_cost = end_cost; | |
} | |
} | |
optimizing_val.into_vec() | |
} | |
} | |
/// Adaptive Gradient Descent
///
/// The adaptive gradient descent algorithm (Duchi et al. 2010).
///
/// Derives `Clone` and `Copy` like the other optimizers in this module, so
/// a configured algorithm can be reused without being moved.
#[derive(Clone, Copy, Debug)]
pub struct AdaGrad {
    /// The base step size applied to every scaled gradient.
    alpha: f64,
    /// The adaptive constant added to the square root of the accumulated
    /// squared gradients in the denominator of each step.
    tau: f64,
    /// The number of passes through the data.
    iters: usize,
}

impl AdaGrad {
    /// Constructs a new AdaGrad algorithm.
    ///
    /// The arguments are, in order: the step size (`alpha`), the adaptive
    /// scaling constant (`tau`) and the number of passes through the data
    /// (`iters`).
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::optim::grad_desc::AdaGrad;
    ///
    /// // Create a new AdaGrad algorithm with step size 0.5
    /// // and adaptive scaling constant 1.0
    /// let gd = AdaGrad::new(0.5, 1.0, 100);
    /// ```
    ///
    /// # Panics
    ///
    /// Panics unless `alpha` is strictly greater than zero and `tau` is
    /// greater than or equal to zero (NaN values fail both comparisons).
    pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
        assert!(alpha > 0f64,
                "The step size (alpha) must be greater than 0.");
        assert!(tau >= 0f64,
                "The adaptive constant (tau) cannot be negative.");

        AdaGrad {
            alpha: alpha,
            tau: tau,
            iters: iters,
        }
    }
}

/// The default AdaGrad configuration.
///
/// The defaults are:
///
/// - alpha = 1
/// - tau = 3
/// - iters = 100
impl Default for AdaGrad {
    fn default() -> AdaGrad {
        AdaGrad {
            alpha: 1f64,
            tau: 3f64,
            iters: 100,
        }
    }
}
impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad { | |
fn optimize(&self, | |
model: &M, | |
start: &[f64], | |
inputs: &M::Inputs, | |
targets: &M::Targets) | |
-> Vec<f64> { | |
// Initialize the adaptive scaling | |
let mut ada_s = Vector::zeros(start.len()); | |
// Initialize the optimal parameters | |
let mut optimizing_val = Vector::new(start.to_vec()); | |
// Set up the indices for permutation | |
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>(); | |
// The cost at the start of each iteration | |
let mut start_iter_cost = 0f64; | |
for _ in 0..self.iters { | |
// The cost at the end of each stochastic gd pass | |
let mut end_cost = 0f64; | |
// Permute the indices | |
rand_utils::in_place_fisher_yates(&mut permutation); | |
for i in &permutation { | |
// Compute the cost and gradient for this data pair | |
let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(), | |
&inputs.select_rows(&[*i]), | |
&targets.select_rows(&[*i])); | |
// Update the adaptive scaling by adding the gradient squared | |
utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y); | |
// Compute the change in gradient | |
utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| { | |
*x = self.alpha * (*x / (self.tau + (y).sqrt())) | |
}); | |
// Update the parameters | |
optimizing_val = &optimizing_val - Vector::new(vec_data); | |
// Set the end cost (this is only used after the last iteration) | |
end_cost += cost; | |
} | |
end_cost /= inputs.rows() as f64; | |
// Early stopping | |
if (start_iter_cost - end_cost).abs() < LEARNING_EPS { | |
break; | |
} else { | |
// Update the cost | |
start_iter_cost = end_cost; | |
} | |
} | |
optimizing_val.into_vec() | |
} | |
} | |
/// RMSProp
///
/// The RMSProp algorithm (Hinton et al. 2012).
#[derive(Debug, Clone, Copy)]
pub struct RMSProp {
    /// Base step size applied to every scaled gradient.
    learning_rate: f64,
    /// Weight kept from the running average of squared gradients at each
    /// update; the new squared gradient receives `1 - decay_rate`.
    decay_rate: f64,
    /// Small value added under the square root to avoid dividing by zero.
    epsilon: f64,
    /// The number of passes through the data.
    iters: usize,
}

/// The default RMSProp configuration.
///
/// Default values:
///
/// - learning_rate = 0.01
/// - decay_rate = 0.9
/// - epsilon = 1.0e-5
/// - iters = 50
impl Default for RMSProp {
    fn default() -> Self {
        RMSProp {
            iters: 50,
            epsilon: 1.0e-5,
            decay_rate: 0.9,
            learning_rate: 0.01,
        }
    }
}

impl RMSProp {
    /// Builds an RMSProp algorithm.
    ///
    /// The arguments are, in order: the learning rate, the decay rate,
    /// epsilon and the number of passes through the data.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::optim::grad_desc::RMSProp;
    ///
    /// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
    /// ```
    ///
    /// # Panics
    ///
    /// Panics unless `learning_rate` and `epsilon` are strictly positive and
    /// `decay_rate` lies strictly between 0 and 1 (NaN values fail these
    /// comparisons and panic too).
    pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
        assert!(learning_rate > 0f64, "The learning rate must be positive");
        assert!(decay_rate > 0f64 && decay_rate < 1f64,
                "The decay rate must be between 0 and 1");
        assert!(epsilon > 0f64, "Epsilon must be positive");

        RMSProp {
            learning_rate: learning_rate,
            decay_rate: decay_rate,
            epsilon: epsilon,
            iters: iters,
        }
    }
}
impl<M> OptimAlgorithm<M> for RMSProp | |
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> { | |
fn optimize(&self, | |
model: &M, | |
start: &[f64], | |
inputs: &M::Inputs, | |
targets: &M::Targets) | |
-> Vec<f64> { | |
// Initial parameters | |
let mut params = Vector::new(start.to_vec()); | |
// Running average of squared gradients | |
let mut rmsprop_cache = Vector::zeros(start.len()); | |
// Set up indices for permutation | |
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>(); | |
// The cost from the previous iteration | |
let mut prev_cost = 0f64; | |
for _ in 0..self.iters { | |
// The cost at end of each pass | |
let mut end_cost = 0f64; | |
// Permute the vertices | |
rand_utils::in_place_fisher_yates(&mut permutation); | |
for i in &permutation { | |
let (cost, grad) = model.compute_grad(params.data(), | |
&inputs.select_rows(&[*i]), | |
&targets.select_rows(&[*i])); | |
let mut grad = Vector::new(grad); | |
let grad_squared = grad.clone().apply(&|x| x*x); | |
// Update cached average of squared gradients | |
rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate); | |
// RMSProp update rule | |
utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| { | |
*x = *x * self.learning_rate / (y + self.epsilon).sqrt(); | |
}); | |
params = ¶ms - &grad; | |
end_cost += cost; | |
} | |
end_cost /= inputs.rows() as f64; | |
// Early stopping | |
if (prev_cost - end_cost).abs() < LEARNING_EPS { | |
break; | |
} else { | |
prev_cost = end_cost; | |
} | |
} | |
params.into_vec() | |
} | |
} | |
/// Constructor precondition tests.
///
/// Each test passes exactly one invalid argument and names the assertion
/// message it expects, so a panic raised by a different check does not
/// count as a pass.
#[cfg(test)]
mod tests {
    use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};

    #[test]
    #[should_panic(expected = "The step size (alpha) must be greater than 0.")]
    fn gd_neg_stepsize() {
        let _ = GradientDesc::new(-0.5, 0);
    }

    #[test]
    #[should_panic(expected = "The momentum (alpha) must be greater than 0.")]
    fn stochastic_gd_neg_momentum() {
        let _ = StochasticGD::new(-0.5, 1f64, 0);
    }

    #[test]
    #[should_panic(expected = "The step size (mu) must be greater than 0.")]
    fn stochastic_gd_neg_stepsize() {
        let _ = StochasticGD::new(0.5, -1f64, 0);
    }

    #[test]
    #[should_panic(expected = "The step size (alpha) must be greater than 0.")]
    fn adagrad_neg_stepsize() {
        let _ = AdaGrad::new(-0.5, 1f64, 0);
    }

    #[test]
    #[should_panic(expected = "The adaptive constant (tau) cannot be negative.")]
    fn adagrad_neg_adaptive_scale() {
        let _ = AdaGrad::new(0.5, -1f64, 0);
    }

    // RMSProp::new takes (learning_rate, decay_rate, epsilon, iters):
    // the first argument is the learning rate, the second the decay rate.
    #[test]
    #[should_panic(expected = "The learning rate must be positive")]
    fn rmsprop_neg_learning_rate() {
        let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
    }

    #[test]
    #[should_panic(expected = "The decay rate must be between 0 and 1")]
    fn rmsprop_neg_decay_rate() {
        let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
    }

    // The upper bound of the decay rate is exclusive.
    #[test]
    #[should_panic(expected = "The decay rate must be between 0 and 1")]
    fn rmsprop_decay_rate_not_below_one() {
        let _ = RMSProp::new(0.5, 1.0, 1.0e-5, 0);
    }

    #[test]
    #[should_panic(expected = "Epsilon must be positive")]
    fn rmsprop_neg_epsilon() {
        let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
    }
}