AdaDelta Solver (v3) #2782
Merged
@@ -0,0 +1,24 @@
+# The train/test net protocol buffer definition
+net: "examples/mnist/lenet_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 1.0
+lr_policy: "fixed"
+momentum: 0.95
+weight_decay: 0.0005
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results
+snapshot: 5000
+snapshot_prefix: "examples/mnist/lenet_adadelta"
+# solver mode: CPU or GPU
+solver_mode: GPU
+solver_type: ADADELTA
+delta: 1e-6
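For context, these settings map directly onto the AdaDelta update rule from Zeiler, "ADADELTA: An Adaptive Learning Rate Method" (arXiv:1212.5701), with `momentum` serving as the decay rate ρ and `delta` as the conditioning constant ε. In LaTeX form:

    E[g^2]_t = \rho \, E[g^2]_{t-1} + (1 - \rho) \, g_t^2
    \Delta x_t = - \frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}} \, g_t
    E[\Delta x^2]_t = \rho \, E[\Delta x^2]_{t-1} + (1 - \rho) \, (\Delta x_t)^2

Because the ratio of running averages supplies the per-parameter step size, `base_lr` can stay at 1.0; it only rescales the final update.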
@@ -0,0 +1,19 @@
+net: "examples/mnist/mnist_autoencoder.prototxt"
+test_state: { stage: 'test-on-train' }
+test_iter: 500
+test_state: { stage: 'test-on-test' }
+test_iter: 100
+test_interval: 500
+test_compute_loss: true
+base_lr: 1.0
+lr_policy: "fixed"
+momentum: 0.95
+delta: 1e-8
+display: 100
+max_iter: 65000
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
+# solver mode: CPU or GPU
+solver_mode: GPU
+solver_type: ADADELTA
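Here the repeated `test_state` / `test_iter` pairs configure two evaluation phases over the same net: 500 test iterations in the 'test-on-train' stage and 100 in the 'test-on-test' stage, each run every 500 training iterations. Note also the tighter conditioning constant (`delta: 1e-8`, versus 1e-6 in the LeNet solver above).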
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+./build/tools/caffe train \
+    --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
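This script launches the autoencoder run; the LeNet solver from the first file would be launched the same way. A minimal sketch, assuming that solver is saved as examples/mnist/lenet_solver_adadelta.prototxt (the filename is not visible in this diff):

    #!/bin/bash
    # Assumed path for the LeNet AdaDelta solver; not shown in this diff.
    ./build/tools/caffe train \
        --solver=examples/mnist/lenet_solver_adadelta.prototxt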
@@ -934,10 +934,157 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
 }
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
+  // Add the extra history entries for AdaDelta after those from
+  // SGDSolver::PreSolve
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    this->history_.push_back(
+        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype delta = this->param_.delta();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  size_t update_history_offset = net_params.size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of gradients
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[update_history_offset + param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->temp_[param_id]->mutable_cpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_div(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->temp_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // jointly compute the RMS of both the update and gradient histories
+    caffe_powx(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // compute the update
+    caffe_mul(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+
+    // compute square of update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of updates
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_cpu_data());
+
+    // apply learning rate
+    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of gradients
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_gpu_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[update_history_offset + param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->temp_[param_id]->mutable_gpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_gpu_div(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->temp_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // jointly compute the RMS of both the update and gradient histories
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // compute the update and copy to net_diff
+    caffe_gpu_mul(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+
+    // compute square of update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of updates
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_gpu_data());
ronghanghu (Member) commented on the line above:
    let's add the `local_rate` multiplication after this line, where you have
    computed the square of the update (don't scale the update by `local_rate`
    before computing the square of the update).
+
+    // apply learning rate
+    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
 INSTANTIATE_CLASS(Solver);
 INSTANTIATE_CLASS(SGDSolver);
 INSTANTIATE_CLASS(NesterovSolver);
 INSTANTIATE_CLASS(AdaGradSolver);
 INSTANTIATE_CLASS(RMSPropSolver);
+INSTANTIATE_CLASS(AdaDeltaSolver);
 }  // namespace caffe
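For readers tracing the blob arithmetic above, the following is a compact per-element sketch of the same step. It is not part of the PR, and the function name and signature are invented for illustration. `Eg2` plays the role of `history_[param_id]`, `Edx2` of `history_[update_history_offset + param_id]`, `rho` of `momentum`, and `eps` of `delta`. In line with the review comment above, the squared update is accumulated into `Edx2` before `local_rate` is applied.

    // Hypothetical standalone sketch of the AdaDelta step implemented above;
    // not part of the PR.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    void adadelta_step(std::vector<float>& w,        // parameters
                       const std::vector<float>& g,  // gradients (diff)
                       std::vector<float>& Eg2,      // history of squared gradients
                       std::vector<float>& Edx2,     // history of squared updates
                       float rho, float eps, float local_rate) {
      for (std::size_t i = 0; i < w.size(); ++i) {
        // update history of gradients
        Eg2[i] = rho * Eg2[i] + (1.0f - rho) * g[i] * g[i];
        // RMS(update history) / RMS(gradient history); eps guards the division
        float dx = std::sqrt((Edx2[i] + eps) / (Eg2[i] + eps)) * g[i];
        // update history of updates before any learning-rate scaling
        Edx2[i] = rho * Edx2[i] + (1.0f - rho) * dx * dx;
        // apply learning rate and descend
        w[i] -= local_rate * dx;
      }
    }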