diff --git a/DIRECTORY.md b/DIRECTORY.md
index 564a7813807..7a800e41379 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -174,6 +174,7 @@
   * Optimization
     * [Adam](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/optimization/adam.rs)
     * [Gradient Descent](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/optimization/gradient_descent.rs)
+    * [Momentum](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/optimization/momentum.rs)
 * Math
   * [Abs](https://github.com/TheAlgorithms/Rust/blob/master/src/math/abs.rs)
   * [Aliquot Sum](https://github.com/TheAlgorithms/Rust/blob/master/src/math/aliquot_sum.rs)
diff --git a/src/machine_learning/optimization/mod.rs b/src/machine_learning/optimization/mod.rs
index 7a962993beb..87f09601ab1 100644
--- a/src/machine_learning/optimization/mod.rs
+++ b/src/machine_learning/optimization/mod.rs
@@ -1,5 +1,6 @@
 mod adam;
 mod gradient_descent;
+mod momentum;
 
 pub use self::adam::Adam;
 pub use self::gradient_descent::gradient_descent;
diff --git a/src/machine_learning/optimization/momentum.rs b/src/machine_learning/optimization/momentum.rs
new file mode 100644
index 00000000000..a9b5b9e91c5
--- /dev/null
+++ b/src/machine_learning/optimization/momentum.rs
@@ -0,0 +1,144 @@
+/// Momentum Optimization
+///
+/// Momentum is an extension of gradient descent that accelerates convergence by accumulating
+/// a velocity vector in directions of persistent reduction in the objective function.
+/// This helps the optimizer traverse ravines and can help it escape shallow local minima.
+///
+/// The algorithm maintains a velocity vector holding an exponentially decaying moving
+/// average of past gradients. This lets the optimizer build up speed in consistent
+/// directions while dampening oscillations.
+///
+/// The update equations are:
+///
+/// velocity_{k+1} = beta * velocity_k + derivative(x_k)
+/// x_{k+1} = x_k - learning_rate * velocity_{k+1}
+///
+/// where beta (typically 0.9) controls how much past gradients influence the current update.
+///
+/// # Arguments
+///
+/// * `derivative` - A function that computes the gradient of the objective function at a given point.
+/// * `x` - The initial parameter vector to be optimized.
+/// * `learning_rate` - Step size for each iteration.
+/// * `beta` - Momentum coefficient (typically 0.9). Higher values give more weight to past gradients.
+/// * `num_iterations` - The number of iterations to run the optimization.
+///
+/// # Returns
+///
+/// A mutable reference to the optimized parameter vector `x`.
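+///
+/// # Example
+///
+/// A minimal usage sketch (mirroring the tests below), minimizing
+/// f(x) = x_1^2 + x_2^2 from the starting point [5.0, 6.0]. The function is
+/// module-private, so the block is not run as a doctest:
+///
+/// ```ignore
+/// let derivative_of_square =
+///     |params: &[f64]| params.iter().map(|x| 2.0 * x).collect::<Vec<f64>>();
+/// let mut x = vec![5.0, 6.0];
+/// momentum(derivative_of_square, &mut x, 0.01, 0.9, 1000);
+/// // x is now close to the minimizer [0.0, 0.0]
+/// ```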
+#[allow(dead_code)]
+pub fn momentum(
+    derivative: impl Fn(&[f64]) -> Vec<f64>,
+    x: &mut Vec<f64>,
+    learning_rate: f64,
+    beta: f64,
+    num_iterations: i32,
+) -> &mut Vec<f64> {
+    // Initialize the velocity vector to zero
+    let mut velocity: Vec<f64> = vec![0.0; x.len()];
+
+    for _ in 0..num_iterations {
+        let gradient = derivative(x);
+
+        // Update velocity and parameters
+        for ((x_k, vel), grad) in x.iter_mut().zip(velocity.iter_mut()).zip(gradient.iter()) {
+            *vel = beta * *vel + grad;
+            *x_k -= learning_rate * *vel;
+        }
+    }
+    x
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_momentum_optimized() {
+        fn derivative_of_square(params: &[f64]) -> Vec<f64> {
+            params.iter().map(|x| 2.0 * x).collect()
+        }
+
+        let mut x: Vec<f64> = vec![5.0, 6.0];
+        let learning_rate: f64 = 0.01;
+        let beta: f64 = 0.9;
+        let num_iterations: i32 = 1000;
+
+        let minimized_vector = momentum(
+            derivative_of_square,
+            &mut x,
+            learning_rate,
+            beta,
+            num_iterations,
+        );
+
+        let test_vector = [0.0, 0.0];
+        let tolerance = 1e-6;
+
+        for (minimized_value, test_value) in minimized_vector.iter().zip(test_vector.iter()) {
+            assert!((minimized_value - test_value).abs() < tolerance);
+        }
+    }
+
+    #[test]
+    fn test_momentum_unoptimized() {
+        fn derivative_of_square(params: &[f64]) -> Vec<f64> {
+            params.iter().map(|x| 2.0 * x).collect()
+        }
+
+        let mut x: Vec<f64> = vec![5.0, 6.0];
+        let learning_rate: f64 = 0.01;
+        let beta: f64 = 0.9;
+        let num_iterations: i32 = 10;
+
+        let minimized_vector = momentum(
+            derivative_of_square,
+            &mut x,
+            learning_rate,
+            beta,
+            num_iterations,
+        );
+
+        let test_vector = [0.0, 0.0];
+        let tolerance = 1e-6;
+
+        // After only 10 iterations the parameters should still be far from the minimum
+        for (minimized_value, test_value) in minimized_vector.iter().zip(test_vector.iter()) {
+            assert!((minimized_value - test_value).abs() >= tolerance);
+        }
+    }
+
+    #[test]
+    fn test_momentum_faster_than_gd() {
+        fn derivative_of_square(params: &[f64]) -> Vec<f64> {
+            params.iter().map(|x| 2.0 * x).collect()
+        }
+
+        // Check that momentum converges faster than plain gradient descent
+        let mut x_momentum: Vec<f64> = vec![5.0, 6.0];
+        let mut x_gd: Vec<f64> = vec![5.0, 6.0];
+        let learning_rate: f64 = 0.01;
+        let beta: f64 = 0.9;
+        let num_iterations: i32 = 50;
+
+        momentum(
+            derivative_of_square,
+            &mut x_momentum,
+            learning_rate,
+            beta,
+            num_iterations,
+        );
+
+        // Plain gradient descent, inlined for comparison
+        for _ in 0..num_iterations {
+            let gradient = derivative_of_square(&x_gd);
+            for (x_k, grad) in x_gd.iter_mut().zip(gradient.iter()) {
+                *x_k -= learning_rate * grad;
+            }
+        }
+
+        // Momentum should end up closer to the minimum at the origin
+        let momentum_distance: f64 = x_momentum.iter().map(|x| x * x).sum();
+        let gd_distance: f64 = x_gd.iter().map(|x| x * x).sum();
+
+        assert!(momentum_distance < gd_distance);
+    }
+}