In [None]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_01a_fastai_layers")' FastaiNotebook_01a_fastai_layers

Installing packages:
	.package(path: "/home/ubuntu/dev_swift/FastaiNotebook_01a_fastai_layers")
		FastaiNotebook_01a_fastai_layers
With SwiftPM flags: []
Working in: /tmp/tmp06qvc72w/swift-install
Updating https://github.com/latenitesoft/NotebookExport
Updating https://github.com/mxcl/Path.swift
Updating https://github.com/JustHTTP/Just
Completed resolution in 1.68s
Compile Swift Module 'FastaiNotebook_01a_fastai_layers' (3 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Installation complete!


In [None]:
//export
import Path
import TensorFlow

In [None]:
import FastaiNotebook_01a_fastai_layers

## The forward and backward passes

Typing `Tensor<Float>` all the time is tedious. The S4TF team expects to make `Float` be the default so we can just say `Tensor`.  Until that happens though, we can define our own alias.

In [None]:
// export
/// Shorthand for `Tensor<Float>`, the tensor type used throughout this notebook.
public typealias TF=Tensor<Float>

We will need to normalize our data.

In [None]:
// export
/// Standardizes `x` by subtracting `mean` and dividing the result by `std`.
///
/// - Parameters:
///   - x: The tensor to normalize.
///   - mean: The value subtracted from `x`.
///   - std: The value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x:TF, mean:TF, std:TF) -> TF {
    let centered = x - mean
    return centered / std
}

In [None]:
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

Normalize the training and validation sets with the training set statistics.

In [None]:
// Mean and standard deviation of the training inputs; printed as a sanity check.
let trainMean = xTrain.mean()
let trainStd  = xTrain.std()
print(trainMean, trainStd)

0.13066047 0.3081079


In [None]:
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

To test everything is going well:

In [None]:
//export
/// Asserts that the absolute value of `a` is below `tolerance`.
///
/// The check uses `assert`, so it is only evaluated in debug builds.
/// NOTE(review): for a non-scalar `a`, the meaning of `abs(a) < tolerance` depends on the
/// `Tensor`/scalar `<` overload in scope (presumably "all elements") — confirm.
///
/// - Parameters:
///   - a: The tensor expected to be close to zero.
///   - tolerance: Exclusive upper bound on `abs(a)`; defaults to `1e-3`.
public func testNearZero(_ a: TF, tolerance: Float = 1e-3) {
    assert(
        abs(a) < tolerance,
        "Near zero: \(a)"
    )
}

In [None]:
testNearZero(xTrain.mean())
testNearZero(xTrain.std() - 1.0)

In [None]:
// n and m are the first and second dimensions of xTrain;
// c is the largest value found in yTrain plus one.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max()+1
print(n, m, c)

60000 784 10


## Foundations version

### Basic architecture

In [None]:
//num hidden
let nh = 50

In [None]:
// simplified kaiming init / he init
// Each weight matrix is drawn from a random normal and divided by the square root of
// its first dimension (m for w1, nh for w2); biases start at zero.
let w1 = TF(randomNormal: [m, nh]) / sqrt(Float(m))
let b1 = TF(zeros: [nh])
let w2 = TF(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2 = TF(zeros: [1])

In [None]:
testNearZero(w1.mean())
testNearZero(w1.std()-1/sqrt(Float(m)))

In [None]:
// This should be ~ (0,1) (mean,std)...
(xValid.mean(),xValid.std())

▿ 2 elements
  - .0 : 0.006017743
  - .1 : 1.0076997


Instead of `@` in python we use `•` (or `matmul`) in Swift:

In [None]:
/// Linear layer: the matrix product of `x` and `w`, plus the bias `b`.
func lin(_ x: TF, _ w: TF, _ b: TF) -> TF {
    return matmul(x, w) + b
}

In [None]:
let t = lin(xValid, w1, b1)

In [None]:
//...so should this, because we used kaiming init, which is designed to do this
print(t.mean(), t.std())

-0.115693174 0.98209363


In [None]:
/// Hand-written ReLU: the element-wise maximum of `x` and zero.
func myRelu(_ x:TF) -> TF {
    return max(x, 0)
}

In [None]:
let t = myRelu(lin(xValid, w1, b1))

In [None]:
//...actually it really should be this!
print(t.mean(),t.std())

0.33261237 0.5209647


In [None]:
// kaiming init / he init for relu
let w1 = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [None]:
print(w1.mean(), w1.std())

-0.0006689982 0.05056813


In [None]:
let t = myRelu(lin(xValid, w1, b1))
print(t.mean(), t.std())

0.6856287 0.93599206


Here is a simple basic model:

In [None]:
/// Two-layer network: linear -> ReLU -> linear.
///
/// Reads the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: The input batch.
/// - Returns: The output of the second linear layer.
func model(_ xb: TF) -> TF {
    let hidden = myRelu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [None]:
time(repeating: 10) { _ = model(xValid) }

average: 0.9829972000000001 ms,   min: 0.941824 ms,   max: 1.104874 ms


### Loss function

We begin with the mean squared error to have easier gradient computations.

In [None]:
let preds = model(xTrain)

In [None]:
// export
/// Mean squared error between `out` and `targ`.
///
/// - Parameters:
///   - out: Predictions; the last axis is squeezed away before comparing with `targ`.
///   - targ: Target values.
/// - Returns: The mean of the squared differences.
public func mse(_ out: TF, _ targ: TF) -> TF {
    let residual = out.squeezingShape(at: -1) - targ
    let squaredError = residual.squared()
    return squaredError.mean()
}

One more step compared to Python: we have to make sure our labels are properly converted to floats.

In [None]:
// Convert these to Float dtype.
// The Float copies are what gets passed to `mse` and to the gradient code below.
var yTrainF = TF(yTrain)
var yValidF = TF(yValid)

In [None]:
mse(preds, yTrainF)

42.934517


## Gradients and backward pass

Here we show how to calculate gradients for a simple model the hard way: manually.

To store the gradients a bit like in PyTorch we introduce a `TensorWithGrad` class that has two attributes: the original tensor and the gradient. We choose a class to easily replicate the Python notebook: classes are reference types (which means they are mutable) while structures are value types.

In fact, since this is the first time we're discovering Swift classes, let's jump into a [sidebar discussion about Value Semantics vs Reference Semantics](https://docs.google.com/presentation/d/1dc6o2o-uYGnJeCeyvgsgyk05dBMneArxdICW5vF75oU/edit#slide=id.g5669969ead_0_145) since it is a pretty fundamental part of the programming model and a huge step forward that Swift takes.

When we get back, we'll keep charging on, even though this is very non-idiomatic Swift code!


In [None]:
/// A tensor paired with a mutable slot for its gradient.
///
/// WARNING: This is designed to be similar to the PyTorch 02_fully_connected lesson,
/// this isn't idiomatic Swift code. It is a class (a reference type) so that the
/// gradient functions can write `grad` on the instances they are handed.
class TensorWithGrad {
    /// The wrapped tensor value.
    var inner: TF
    /// The gradient associated with `inner`; starts as zeros of the same shape.
    var grad: TF

    /// Wraps `x` and sets `grad` to zeros with the shape of `x`.
    init(_ x: TF) {
        self.inner = x
        self.grad = TF(zeros: x.shape)
    }
}

In [None]:
// Redefine our functions on TensorWithGrad.
/// Linear layer on wrapped tensors: `x.inner • w.inner + b.inner`, wrapped in a fresh
/// `TensorWithGrad`.
func lin(_ x: TensorWithGrad, _ w: TensorWithGrad, _ b: TensorWithGrad) -> TensorWithGrad {
    let activations = x.inner • w.inner + b.inner
    return TensorWithGrad(activations)
}
/// ReLU on a wrapped tensor: the element-wise maximum of `x.inner` and zero, wrapped in a
/// fresh `TensorWithGrad`.
func myRelu(_ x: TensorWithGrad) -> TensorWithGrad {
    let rectified = max(x.inner, 0)
    return TensorWithGrad(rectified)
}
/// Mean squared error between `inp.inner` (last axis squeezed away) and `targ`.
///
/// Only the loss value is computed; `inp.grad` is not modified.
func mse(_ inp: TensorWithGrad, _ targ: TF) -> TF {
    let residual = inp.inner.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
// Define our gradient functions.
/// Stores in `inp.grad` the gradient of the MSE loss with respect to `inp.inner`.
///
/// The value written is `2 * (inp.inner - targ) / batchSize`, where `batchSize` is the
/// first dimension of `inp.inner` and the difference has its last axis restored.
func mseGrad(_ inp: TensorWithGrad, _ targ: TF) {
    let batchSize = Float(inp.inner.shape[0])
    let residual = inp.inner.squeezingShape(at: -1) - targ
    inp.grad = 2.0 * residual.expandingShape(at: -1) / batchSize
}

/// Stores in `inp.grad` the gradient of the loss with respect to the ReLU input.
///
/// `out.grad` is passed through wherever `inp.inner` is strictly positive; every other
/// position receives zero.
func reluGrad(_ inp: TensorWithGrad, _ out: TensorWithGrad) {
    let isPositive = inp.inner .> 0
    let zeros = TF(zeros: inp.inner.shape)
    inp.grad = isPositive.selecting(out.grad, zeros)
}

This is our python version (we've renamed the python `g` to `grad` for consistency):

```python
def lin_grad(inp, out, w, b):
    inp.grad = out.grad @ w.t()
    w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)
    b.grad = out.grad.sum(0)
```

In Swift `@` is spelled `•`, which is <kbd>option</kbd>-<kbd>8</kbd> on Mac or <kbd>compose</kbd>-<kbd>.</kbd>-<kbd>=</kbd> elsewhere. Or just use the `matmul()` function we've seen already.

In [None]:
/// Back-propagates through a linear layer, filling in three gradients from `out.grad`.
///
/// - `inp.grad`: `out.grad` times the transpose of `w.inner`.
/// - `w.grad`: the transpose of `inp.inner` times `out.grad`.
/// - `b.grad`: `out.grad` summed over axis 0.
func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    inp.grad = matmul(out.grad, w.inner.transposed())
    w.grad = matmul(inp.inner.transposed(), out.grad)
    b.grad = out.grad.sum(squeezingAxes: 0)
}

In [None]:
// Wrap each parameter in a TensorWithGrad so the backward pass has somewhere to store
// its gradient.
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)

In [None]:
/// Runs the forward pass on `inp`, then back-propagates the MSE loss against `targ`.
///
/// Uses the notebook-level parameters `w1a`, `b1a`, `w2a` and `b2a`. Nothing is returned:
/// the results are written to the `grad` properties of `inp`, `w1a`, `b1a`, `w2a` and
/// `b2a` (and of the intermediate activations, which go out of scope on return).
func forwardAndBackward(_ inp:TensorWithGrad, _ targ:TF){
    //forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    //we don't actually need the loss in backward! It is still evaluated to complete the
    //forward pass, but discarded explicitly rather than bound to an unused local.
    _ = mse(out, targ)
    
    //backward pass: visit the layers in reverse order, each step filling in `grad`.
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}

In [None]:
let inp = TensorWithGrad(xTrain)

In [None]:
forwardAndBackward(inp, yTrainF)

## The swift way

As we said before, swift operates in a different way. If we go back to what is happening in the backward pass, we go from the end result (our loss) which allows us to compute the gradient of that loss with respect to the last activations. Then consider all the layers we went through during the forward pass in the reversed order (from the last one to the first one) and for each of them, compute the gradients of the loss with respect to the inputs (and potentially parameters) from the gradients of the loss with respect to the outputs.

For instance if we go back to the basic `relu_grad` function we had in python:
```
def relu_grad(inp, out):
    # grad of relu with respect to input activations
    inp.g = (inp>0).float() * out.g
```
we explain how we infer the gradients of the loss with respect to the inputs of the relu (`inp.g`) from the gradients of the loss with respect to the outputs of that same relu (`out.g`).

Swift implements differentiation in a more functional way than PyTorch: there is no grad attribute. Instead, we provide a function that takes the gradients with respect to the outputs and returns the gradients with respect to the inputs (and potentially, parameters).

The tricky thing is that this gradient computation often requires the inputs/outputs of the layer: if we look at `relu_grad` up there, it needs to know the value of `inp`. That's why we don't just write a function that does `𝛁Out -> 𝛁Inp`, but a function that takes the input and returns that pullback:
```
(Inp) -> ((𝛁Out) -> 𝛁Inp)
```

Let's look at what it gives us for the relu:

In [None]:
/// Builds the pullback of ReLU evaluated at `inp`.
///
/// - Parameter inp: The input the ReLU was applied to; captured by the returned closure.
/// - Returns: A closure mapping the gradient with respect to the ReLU output to the
///   gradient with respect to its input: the incoming gradient where `inp` is strictly
///   positive, zero elsewhere.
func diffRelu(_ inp: TF) -> ((TF) -> TF) {
    return { 𝛁out -> TF in
        let isPositive = inp .> 0
        return isPositive.selecting(𝛁out, TF(zeros: inp.shape))
    }
}

When we go through our relu layer, we won't just ask for `y = relu(x)`, but will also store the pullback `pb = diffRelu(x)` for the backward pass. This will automatically capture a reference to the value of `x` that is used inside that pullback. 

Other differentiation functions look a bit the same:

In [None]:
/// Builds the pullback of the MSE loss with respect to `inp`.
///
/// - Parameters:
///   - inp: Predictions; the last axis is squeezed away before comparing with `targ`.
///   - targ: Target values. No gradient is produced for them.
/// - Returns: A closure mapping the gradient with respect to the loss to the gradient
///   with respect to `inp`.
func diffMse(_ inp: TF, _ targ: TF) -> ((TF) -> TF) {
    return { 𝛁loss -> TF in
        let residual = inp.squeezingShape(at: -1) - targ
        let 𝛁local = 2.0 * residual.expandingShape(at: -1) / Float(inp.shape[0])
        // Chain rule: scale the local gradient by the upstream gradient. With the usual
        // seed of 1 this leaves the result unchanged.
        return 𝛁loss * 𝛁local
    }
}

`diffMse` doesn't return gradients for targ because the function isn't differentiable with respect to that variable in general (we could of course differentiate it in this case, but we don't need those gradients).

In [None]:
/// Builds the pullback of the linear layer `inp • w + b`.
///
/// - Returns: A closure mapping the gradient with respect to the layer output to a tuple
///   of gradients with respect to `inp`, `w` and `b`, in that order.
func diffLin(_ inp: TF, _ w: TF, _ b: TF) -> ((TF) -> (TF, TF, TF)) {
    return { 𝛁out -> (TF, TF, TF) in
        let 𝛁inp = matmul(𝛁out, w.transposed())
        let 𝛁w = matmul(inp.transposed(), 𝛁out)
        let 𝛁b = 𝛁out.sum(squeezingAxes: 0)
        return (𝛁inp, 𝛁w, 𝛁b)
    }
}

`diffLin` returns the gradients with respect to all its inputs (more like inputs and parameters).

Then the forward and backward passes are written like this:

In [None]:
/// Runs the forward pass while collecting one pullback per layer, then chains the
/// pullbacks in reverse order to obtain the gradients of the MSE loss.
///
/// Reads the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameters:
///   - inp: The input batch.
///   - targ: The targets passed to `mse` and `diffMse`.
/// - Returns: The gradients with respect to `inp`, `w1`, `b1`, `w2` and `b2`, in that order.
func forwardAndBackward(_ inp: TF, _ targ: TF) -> (TF, TF, TF, TF, TF){
    //forward pass:
    let (l1, pbL1)    = (lin(inp, w1, b1), diffLin(inp, w1, b1))
    let (l2, pbL2)    = (myRelu(l1), diffRelu(l1))
    let (out, pbOut)  = (lin(l2, w2, b2), diffLin(l2, w2, b2))
    //we don't actually need the loss in backward, but we need the pullback.
    //The loss is still evaluated but discarded, instead of being bound to an unused local.
    let (_, pbLoss) = (mse(out, targ), diffMse(out, targ))
    
    //backward pass:
    let 𝛁loss = TF(1) //We don't really need it but the gradient of the loss with respect to itself is 1
    let 𝛁out = pbLoss(𝛁loss)
    let (𝛁l2, 𝛁w2, 𝛁b2) = pbOut(𝛁out)
    let 𝛁l1 = pbL2(𝛁l2)
    let (𝛁inp, 𝛁w1, 𝛁b1) = pbL1(𝛁l1)
    return (𝛁inp, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2)
}

In [None]:
let (𝛁xTrain, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2) = forwardAndBackward(xTrain, yTrainF)

In [None]:
// Check that the gradients computed both ways are the same: the pullback-based results
// are compared against the `grad` values stored by the TensorWithGrad version.
testNearZero(𝛁xTrain - inp.grad)
testNearZero(𝛁w1     - w1a.grad)
testNearZero(𝛁b1     - b1a.grad)
testNearZero(𝛁w2     - w2a.grad)
testNearZero(𝛁b2     - b2a.grad)

## Using the S4TF Language Integrated Autodiff

Let's compare to the language-integrated Swift for TensorFlow autodiff now. We have to mark the function as `@differentiable`.  This informs the compiler that we want it to automatically generate its gradients, and causes it to emit errors if there is anything contributing to the result of the function that cannot be differentiated.

The `@differentiable` attribute is normally optional in a S4TF standalone environment, but is currently required in Jupyter notebooks.  The S4TF team is planning to relax this limitation over time.

In [None]:
/// Forward pass of the two-layer network followed by the MSE loss, written so that the
/// compiler can differentiate it.
///
/// - Parameters:
///   - inp: The input batch.
///   - targ: The targets the squeezed predictions are compared with.
///   - w1: Weights of the first linear layer.
///   - b1: Bias of the first linear layer.
///   - w2: Weights of the second linear layer.
///   - b2: Bias of the second linear layer.
/// - Returns: The mean squared error between the predictions and `targ`.
@differentiable
func forward(_ inp: TF, _ targ: TF, w1: TF, b1: TF, w2: TF, b2: TF) -> TF {
    let hidden = relu(matmul(inp, w1) + b1)
    let predictions = matmul(hidden, w2) + b2
    let residual = predictions.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

Then we can ask for the gradients w.r.t. any individual parameter like this: (𝛁₂ is for second way of computing gradients, not gradients squared, or second order gradients)

In [None]:
// One `gradient(at:)` call per tensor. In each closure the parameter name shadows the
// notebook-level value of the same name, so only that argument of `forward` is the
// differentiation variable; the other arguments are the notebook-level values.
let 𝛁₂xTrain = gradient(at: xTrain) {xTrain in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let 𝛁₂w1 = gradient(at: w1) {w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let 𝛁₂b1 = gradient(at: b1) {b1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let 𝛁₂w2 = gradient(at: w2) {w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let 𝛁₂b2 = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the manually calculated gradients.
testNearZero(𝛁₂xTrain - 𝛁xTrain)
testNearZero(𝛁₂w1     - 𝛁w1)
testNearZero(𝛁₂b1     - 𝛁b1)
testNearZero(𝛁₂w2     - 𝛁w2)
testNearZero(𝛁₂b2     - 𝛁b2)

You can also ask for gradients with respect to multiple things at the same time, but unfortunately, current AD bugs prevent getting more than two gradients at a time.  We can do a little bit better than the above code like so:

In [None]:
// Same gradients, requested two tensors at a time; b2 is left over and handled alone.
let (𝛁₃xTrain, 𝛁₃w1) = gradient(at: xTrain, w1) {
    xTrain, w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let (𝛁₃b1, 𝛁₃w2) = gradient(at: b1, w2) {
    b1, w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)
}
let 𝛁₃b2 = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}

// Check that they agree with the one-at-a-time gradients.
testNearZero(𝛁₂xTrain - 𝛁₃xTrain)
testNearZero(𝛁₂w1     - 𝛁₃w1)
testNearZero(𝛁₂b1     - 𝛁₃b1)
testNearZero(𝛁₂w2     - 𝛁₃w2)
testNearZero(𝛁₂b2     - 𝛁₃b2)

This is currently pretty ugly, and even when the bugs are fixed, it still won't be very idiomatic.  A more common thing is to wrap up all your parameters into a struct, and differentiate w.r.t. all of them at the same time (which, when we refactor the code, will be our model itself).

Here is an example of that:


In [None]:
/// Bundles the input and the four parameter tensors into a single `Differentiable`
/// value, so that gradients for all of them can be requested in one call.
struct myParams: Differentiable {
    /// The input batch.
    public var x: TF
    /// Weights of the first linear layer.
    public var w1: TF
    /// Bias of the first linear layer.
    public var b1: TF
    /// Weights of the second linear layer.
    public var w2: TF
    /// Bias of the second linear layer.
    public var b2: TF
}

let allParams = myParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)

In [None]:
// We can now get all of the gradients at once with a single call, and a single forward computation.
// The closure parameter shadows the notebook-level `allParams`; the result is read below
// through fields named after the struct members (x, w1, b1, w2, b2).
let grads = gradient(at: allParams) {
  allParams in
    forward(allParams.x, yTrainF,
            w1: allParams.w1, 
            b1: allParams.b1,
            w2: allParams.w2, 
            b2: allParams.b2)
}

// Check that this still calculates the same thing.
testNearZero(𝛁₂xTrain  - grads.x)
testNearZero(𝛁₂w1      - grads.w1)
testNearZero(𝛁₂b1      - grads.b1)
testNearZero(𝛁₂w2      - grads.w2)
testNearZero(𝛁₂b2      - grads.b2)

If you wanted the value for your loss as well as the gradients, you just have to use `valueWithGradient`.

In [None]:
// `valueWithGradient` returns a pair, bound here to `loss` and `grads`.
let (loss,grads) = valueWithGradient(at: allParams) { 
    allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
}

// The gradients must still match the ones computed one tensor at a time.
testNearZero(𝛁₂xTrain  - grads.x)
testNearZero(𝛁₂w1      - grads.w1)
testNearZero(𝛁₂b1      - grads.b1)
testNearZero(𝛁₂w2      - grads.w2)
testNearZero(𝛁₂b2      - grads.b2)

In terms of timing our implementation gives:

In [None]:
time(repeating: 10) { _ = forwardAndBackward(xTrain, yTrainF) }

average: 23.8023758 ms,   min: 22.71083 ms,   max: 24.415096 ms


In [None]:
time(repeating: 10) {
    _ = valueWithGradient(at: allParams) { 
        allParams in forward(allParams.x, 
                             yTrainF, 
                             w1: allParams.w1, 
                             b1: allParams.b1, 
                             w2: allParams.w2, 
                             b2: allParams.b2)
    }
}

average: 22.7956517 ms,   min: 21.799479 ms,   max: 24.44094 ms


### Refactor with valueWithPullback

One thing you will have noticed is that in our forward and backward pass we often ask for a value and its pullback at the same time. That's why the two are often implemented together in the primitives of S4TF:

In [None]:
/// Computes ReLU of `inp` together with its pullback.
///
/// - Returns: A pair of the element-wise maximum of `inp` and zero, and a closure that
///   passes an incoming gradient through where `inp` is strictly positive and yields zero
///   elsewhere.
func reluWithPb(_ inp: TF) -> (TF, (TF) -> TF) {
    let value = max(inp, 0)
    let pullback: (TF) -> TF = { 𝛁out in
        (inp .> 0).selecting(𝛁out, TF(zeros: inp.shape))
    }
    return (value, pullback)
}

In [None]:
/// Computes the linear layer `inp • w + b` together with its pullback.
///
/// - Returns: A pair of the layer output and a closure mapping the gradient with respect
///   to that output to the gradients with respect to `inp`, `w` and `b`, in that order.
func linWithPb(_ inp: TF, _ w: TF, _ b: TF) -> (TF, (TF) -> (TF, TF, TF)) {
    let value = matmul(inp, w) + b
    let pullback: (TF) -> (TF, TF, TF) = { 𝛁out in
        (matmul(𝛁out, w.transposed()), matmul(inp.transposed(), 𝛁out), 𝛁out.sum(squeezingAxes: 0))
    }
    return (value, pullback)
}

In [None]:
/// Computes the MSE loss between `inp` and `targ` together with its pullback.
///
/// - Parameters:
///   - inp: Predictions; the last axis is squeezed away before comparing with `targ`.
///   - targ: Target values. No gradient is produced for them.
/// - Returns: A pair of the loss value and a closure mapping the gradient with respect to
///   the loss to the gradient with respect to `inp`.
func mseWithPb(_ inp: TF, _ targ: TF) -> (TF, (TF) -> (TF)) {
    let residual = inp.squeezingShape(at: -1) - targ
    let loss = residual.squared().mean()
    let pullback: (TF) -> TF = { 𝛁loss in
        // Chain rule: scale the local gradient by the upstream gradient. With the usual
        // seed of 1 this leaves the result unchanged.
        𝛁loss * (2.0 * residual.expandingShape(at: -1) / Float(inp.shape[0]))
    }
    return (loss, pullback)
}

And then our forward and backward can be refactored into:

In [None]:
/// Forward and backward pass built from the `...WithPb` helpers, each of which returns a
/// value together with its pullback.
///
/// Reads the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameters:
///   - inp: The input batch.
///   - targ: The targets passed to `mseWithPb`.
/// - Returns: The gradients with respect to `inp`, `w1`, `b1`, `w2` and `b2`, in that order.
func forwardAndBackward(_ inp: TF, _ targ: TF) -> (TF, TF, TF, TF, TF){
    //forward pass:
    let (l1, pbL1)    = linWithPb(inp, w1, b1)
    let (l2, pbL2)    = reluWithPb(l1)
    let (out, pbOut)  = linWithPb(l2, w2, b2)
    //we don't actually need the loss in backward, but we need the pullback.
    //The loss value is discarded explicitly instead of being bound to an unused local.
    let (_, pbLoss) = mseWithPb(out, targ)
    
    //backward pass:
    let 𝛁loss = TF(1) //We don't really need it but the gradient of the loss with respect to itself is 1
    let 𝛁out = pbLoss(𝛁loss)
    let (𝛁l2, 𝛁w2, 𝛁b2) = pbOut(𝛁out)
    let 𝛁l1 = pbL2(𝛁l2)
    let (𝛁inp, 𝛁w1, 𝛁b1) = pbL1(𝛁l1)
    return (𝛁inp, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2)
}

In [None]:
// Recompute the gradients with the refactored forwardAndBackward.
let (𝛁xTrain, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2) = forwardAndBackward(xTrain, yTrainF)
// Check this is still all correct
testNearZero(𝛁₂xTrain - 𝛁xTrain)
testNearZero(𝛁₂w1     - 𝛁w1)
testNearZero(𝛁₂b1     - 𝛁b1)
testNearZero(𝛁₂w2     - 𝛁w2)
testNearZero(𝛁₂b2     - 𝛁b2)

### Export

In [None]:
import NotebookExport
let exporter = NotebookExport(Path.cwd/"02_fully_connected.ipynb")
print(exporter.export(usingPrefix: "FastaiNotebook_"))

success
