# Annealing

In [None]:
%install '.package(path: "$cwd/FastaiNotebooks")' FastaiNotebooks

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebooks")
		FastaiNotebooks
With SwiftPM flags: []
Working in: /tmp/tmprbmf5ijf
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 2.01s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'FastaiNotebooks' (6 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!


## Load data

In [None]:
import FastaiNotebooks

In [None]:
// export
import Path
import TensorFlow

In [None]:
// Load the MNIST data bunch (mnistDataBunch is defined outside this notebook).
// NOTE(review): `flat: true` presumably yields flattened image vectors — confirm against mnistDataBunch.
let data = mnistDataBunch(flat: true)

In [None]:
// Notebook-level size constants. `m`, `nHid` and `c` are the input, hidden and
// output sizes handed to BasicModel in modelInit(); `n` is not referenced in the
// visible code.
// NOTE(review): 60000 / 784 / 10 look like the MNIST training-set size, the
// flattened 28x28 pixel count and the class count — confirm against the data.
let n = 60000
let m = 784
let c = 10
let nHid = 50

In [None]:
// SGD optimizer for BasicModel, constructed with a learning rate of 1e-2.
let opt = SGD<BasicModel, Float>(learningRate: 1e-2)

In [None]:
/// Builds a new `BasicModel` sized from the notebook-level constants
/// `m` (nIn), `nHid` (nHid) and `c` (nOut).
func modelInit() -> BasicModel {
    return BasicModel(nIn: m, nHid: nHid, nOut: c)
}

In [None]:
/// Runs the model on `inputs`, computes the softmax cross-entropy loss against
/// `labels`, and returns the loss together with the raw outputs and the gradient.
///
/// - Parameters:
///   - model: The model to evaluate and differentiate.
///   - context: The context forwarded to `applied(to:in:)`.
///   - inputs: The input batch fed to the model.
///   - labels: The labels passed to `softmaxCrossEntropy`.
/// - Returns: A tuple of (loss, model outputs, gradient with respect to the model).
func lossOutputWithGrad(
    model: BasicModel,
    in context: Context,
    inputs: Tensor<Float>,
    labels: Tensor<Int32>
) -> (Tensor<Float>, BasicModel.Output, BasicModel.CotangentVector) {
    // The differentiated closure returns only the loss, so the forward-pass
    // outputs are smuggled out through this captured variable.
    var capturedOutputs: BasicModel.Output?
    let (lossValue, gradients) = model.valueWithGradient { net -> Tensor<Float> in
        let logits = net.applied(to: inputs, in: context)
        capturedOutputs = logits
        return softmaxCrossEntropy(logits: logits, labels: labels)
    }
    // Force-unwrap relies on valueWithGradient having executed the closure above.
    return (lossValue, capturedOutputs!, gradients)
}

In [None]:
// Assemble the Learner from the data, the loss/output/gradient function, the optimizer and the model factory.
let learner = Learner(data: data, lossOutputWithGradient: lossOutputWithGrad, optimizer: opt, initializingWith: modelInit)

In [None]:
// Install the learner's delegates: a TrainEvalDelegate and an AvgMetric configured with `accuracy`.
learner.delegates = [Learner.TrainEvalDelegate(), Learner.AvgMetric(metrics: [accuracy])]

In [None]:
// Train for 2 epochs (the recorded output shows epochs 0 and 1).
learner.fit(2)

Beginning epoch 0
[0.4615099, 0.8826]
Beginning epoch 1
[0.3536407, 0.903]


## Add Callbacks

The code below adds callbacks and defines a new training loop.

In [None]:
/// The verdict a callback hands back so that it can control the training loop.
public enum CallbackResult {
    /// Carry on with the training step as normal.
    case proceed
    /// Abandon what is left of the current training step and go straight to the next one.
    case skip
    /// End training.
    case stop
}


/// Base class for training-loop hooks.
///
/// Every hook returns `.proceed` by default; subclasses override the hooks they
/// need and report a `CallbackResult` back to the caller.
open class TrainingCallbacks<M, O: Optimizer, S>
    where O.Model == M,
          O.Scalar == S,
          M.Input == Tensor<S>,
          M.Output == Tensor<S> {

    /// Hook that receives the model and optimizer; the default does nothing.
    open func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        return CallbackResult.proceed
    }

    /// Hook that takes no arguments yet; the default does nothing.
    // TODO: Figure out what to pass here!
    open func beforeBatch() -> CallbackResult {
        return CallbackResult.proceed
    }

    /// Hook that receives a mutable loss tensor; the default does nothing.
    open func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        return CallbackResult.proceed
    }
}

In [None]:
/// Callback that records the loss and the optimizer's learning rate after every batch.
///
/// The recorded series are readable (but not writable) from outside the class.
class Recorder<M, O: Optimizer, S>: TrainingCallbacks<M, O, S>
    where O.Model == M, O.Scalar == S,
          M.Input == Tensor<S>, M.Output == Tensor<S> {
    /// The optimizer captured in `beforeTrain`.
    // NOTE(review): if `O` is a value type this is a copy taken at `beforeTrain`,
    // so later learning-rate changes would not be observed — confirm `O` is a class.
    private var optimizer: O? = nil
    /// Scalar loss appended after each batch, in call order.
    private(set) var losses: [S] = []
    /// Learning rate read from the captured optimizer after each batch, in call order.
    private(set) var lrs: [O.Scalar] = []

    /// Captures the optimizer so its learning rate can be read after each batch.
    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        self.optimizer = optimizer
        return .proceed
    }

    /// Appends the current learning rate and the scalarized loss to the recorded series.
    ///
    /// Traps with a diagnostic if called before `beforeTrain` has captured an optimizer.
    override func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        guard let optimizer = optimizer else {
            preconditionFailure("Recorder.afterBatch called before beforeTrain captured an optimizer")
        }
        lrs.append(optimizer.learningRate)
        losses.append(loss.scalarized())
        return .proceed
    }
}

In [None]:
/// Simple SGD optimizer with a modifiable learning rate.
public class SettableSGD<Model: Layer>: Optimizer
    where Model.AllDifferentiableVariables == Model.CotangentVector {
    /// The learning rate. Assigning a negative value traps.
    public var learningRate: Float {
        willSet(newLearningRate) {
            precondition(newLearningRate >= 0, "Learning rate must be non-negative")
        }
    }

    /// Creates the optimizer.
    ///
    /// - Parameter learningRate: The initial learning rate; must be non-negative.
    public init(learningRate: Float = 0.01) {
        // Property observers do not run during initialization, so validate here too.
        precondition(learningRate >= 0, "Learning rate must be non-negative")
        self.learningRate = learningRate
    }

    /// Applies one gradient-descent step to every tensor parameter of `model`.
    ///
    /// - Parameters:
    ///   - model: The differentiable variables to update in place.
    ///   - direction: The gradient of the loss with respect to those variables.
    public func update(_ model: inout Model.AllDifferentiableVariables,
                       along direction: Model.CotangentVector) {
        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {
            // Descend: step against the gradient. Adding it would increase the loss.
            model[keyPath: kp] -= learningRate * direction[keyPath: kp]
        }
    }
}


In [None]:
// Scratch optimizer for checking that the learning rate can be read and reassigned.
let foo = SettableSGD<Dense<Float>>()

In [None]:
// Show the learning rate; `foo` was built with the initializer default of 0.01.
foo.learningRate

In [None]:
// Reassign the learning rate; this goes through the non-negativity check in `willSet`.
foo.learningRate = 0.2

In [None]:
// Show the learning rate again to confirm the assignment above took effect.
foo.learningRate

In [None]:
/// A non-generalized learning rate scheduler
class LearningRateScheduler<M, O: SettableSGD<M>>: TrainingCallbacks<M, O, Float>
    where O.Model == M,
          M.Input == Tensor<Float>, M.Output == Tensor<Float> {
    
    // A learning rate schedule from step to float.
    typealias ScheduleFunc = (Int) -> Float

    private var optimizer: O?
    private let scheduler: ScheduleFunc
    private var step = 0
    
    init(scheduler: @escaping ScheduleFunc) {
        self.scheduler = scheduler
    }

    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        self.optimizer = optimizer
        return .proceed
    }
              
    override func beforeBatch() -> CallbackResult {
        step += 1
        self.optimizer!.learningRate = scheduler(step)
        return .proceed
    }
    
}

In [None]:
/// Composite callback that forwards each hook to an ordered list of callbacks,
/// stopping at the first one that answers with anything other than `.proceed`.
class SequentialCallbacks<M, O: Optimizer, S>: TrainingCallbacks<M, O, S>
    where O.Model == M, O.Scalar == S,
          M.Input == Tensor<S>, M.Output == Tensor<S> {

    /// The wrapped callbacks, invoked in array order.
    private let callbacks: [TrainingCallbacks<M, O, S>]

    /// Wraps the given array of callbacks.
    init(_ callbacks: [TrainingCallbacks<M, O, S>]) {
        self.callbacks = callbacks
    }

    /// Variadic convenience form of `init(_:)`.
    convenience init(_ callbacks: TrainingCallbacks<M, O, S>...) {
        self.init(callbacks)
    }

    /// Applies `hook` to each callback in order and returns the first `.skip` or
    /// `.stop` it produces; later callbacks are not invoked in that case.
    /// Returns `.proceed` when every callback proceeds (or the list is empty).
    private func firstInterruption(
        _ hook: (TrainingCallbacks<M, O, S>) -> CallbackResult
    ) -> CallbackResult {
        for callback in callbacks {
            let outcome = hook(callback)
            switch outcome {
            case .proceed:
                continue
            case .skip, .stop:
                return outcome
            }
        }
        return .proceed
    }

    /// Forwards `beforeTrain` to each callback in order.
    override func beforeTrain(model: inout M, optimizer: inout O) -> CallbackResult {
        return firstInterruption { $0.beforeTrain(model: &model, optimizer: &optimizer) }
    }

    /// Forwards `beforeBatch` to each callback in order.
    // TODO: Figure out what to pass here!
    override func beforeBatch() -> CallbackResult {
        return firstInterruption { $0.beforeBatch() }
    }

    /// Forwards `afterBatch` to each callback in order, sharing the same loss tensor.
    override func afterBatch(loss: inout Tensor<S>) -> CallbackResult {
        return firstInterruption { $0.afterBatch(loss: &loss) }
    }
}

In [None]:
/// A training loop, now improved with callbacks!
///
/// Makes one pass over `dataset`. For each batch it computes the loss and the
/// gradient with respect to the model, prints the loss, and applies the
/// optimizer update. The `CallbackResult` returned by each hook is honoured:
///
/// - `beforeTrain`: `.stop` returns before any batch is processed. There is no
///   step to skip at that point, so `.skip` is treated like `.proceed`.
/// - `beforeBatch`: `.skip` moves on to the next batch; `.stop` ends training.
/// - `afterBatch`: runs before the optimizer update, so `.skip` discards this
///   batch's update and `.stop` ends training without applying it.
///
/// - Parameters:
///   - model: The model to train, updated in place.
///   - variablesKeyPath: Key path to the model's differentiable variables.
///   - dataset: The batches to iterate over; each supplies `data` and `labels`.
///   - optimizer: The optimizer that applies the gradient.
///   - loss: Differentiable loss taking (predictions, labels).
///   - callbacks: The hooks invoked around training and around each batch.
public func trainWithCallbacks<M, O: Optimizer, S>(
    _ model: inout M,
    at variablesKeyPath: WritableKeyPath<M, M.AllDifferentiableVariables>,
    on dataset: Dataset<Example<S, S>>,
    using optimizer: inout O,
    loss: @escaping @differentiable (Tensor<S>, Tensor<S>) -> Tensor<S>,
    callbacks: TrainingCallbacks<M, O, S>
) where O.Model == M, O.Scalar == S,
        M.Input == Tensor<S>, M.Output == Tensor<S>
{
    let context = Context(learningPhase: .training)
    if case .stop = callbacks.beforeTrain(model: &model, optimizer: &optimizer) {
        return
    }
    for batch in dataset {
        switch callbacks.beforeBatch() {  // TODO: pass in batch!
        case .proceed: break
        case .skip: continue
        case .stop: return
        }
        let (x, y) = (batch.data, batch.labels)
        // The gradient with respect to the labels is also produced and discarded.
        let (lossValue, (𝛁model, _)) = model.valueWithGradient(at: y) { (model, y) -> Tensor<S> in
            let preds = model.applied(to: x, in: context)
            return loss(preds, y)
        }
        // Mutable copy under a distinct name so callbacks can modify it without
        // shadowing the `loss` function parameter.
        var batchLoss = lossValue
        let afterBatchResult = callbacks.afterBatch(loss: &batchLoss)
        print(batchLoss)
        switch afterBatchResult {
        case .proceed: break
        case .skip: continue
        case .stop: return
        }
        optimizer.update(&model[keyPath: variablesKeyPath], along: 𝛁model)
    }
}