In [1]:
using Flux
using Flux: train!, @epochs
using Flux.Tracker: gradient, update!
using Printf

# `gradient`

Returns the "rate of change" of the function at the given values.

In [2]:
f(x) = 3x^2 + 2x + 1
df(x) = gradient(f, x; nest=true)[1] # 6x + 2
d2f(x) = gradient(df, x)[1]          # 6

d2f (generic function with 1 method)

In [3]:
for i in 1:10
    @printf("%4d %20s\n", i, df(i))
end

   1        8.0 (tracked)
   2       14.0 (tracked)
   3       20.0 (tracked)
   4       26.0 (tracked)
   5       32.0 (tracked)
   6       38.0 (tracked)
   7       44.0 (tracked)
   8       50.0 (tracked)
   9       56.0 (tracked)
  10       62.0 (tracked)


In [4]:
for i in 1:10
    @printf("%4d %20s\n", i, d2f(i))
end

   1        6.0 (tracked)
   2        6.0 (tracked)
   3        6.0 (tracked)
   4        6.0 (tracked)
   5        6.0 (tracked)
   6        6.0 (tracked)
   7        6.0 (tracked)
   8        6.0 (tracked)
   9        6.0 (tracked)
  10        6.0 (tracked)


In [5]:
f(a, b) = a*b
# This will flip the arguments, because the "rate of change" of each depends on the other.
gradient(f, 3, 7)

(7.0 (tracked), 3.0 (tracked))

# Linear Regression Example

Here we train a "neural network" with 5 inputs and 2 outputs. There are no hidden layers, so I'm not sure if this is actually considered a neural network.

Each of the two outputs is a weighted combination of the inputs plus a bias.

We are training on a single data point of 5 parameters (`x`) and a single target of 2 parameters (`y`). Thus, we should be able to fit the model perfectly and get a loss very close to zero.

In [6]:
W = rand(2, 5)
b = rand(2)

predict(x) = W*x .+ b

function loss(x, y)
    ŷ = predict(x)
    sum((y .- ŷ).^2)
end

x, y = rand(5), rand(2)

loss(x, y)

1.046756570082112

In [7]:
W = param(W)
b = param(b)

Tracked 2-element Array{Float64,1}:
 0.3948092380303805
 0.5967093463697652

In [8]:
for epoch in 1:10
    gradients = gradient(() -> loss(x, y), params(W, b))

    update!(W, -0.1*gradients[W])
    update!(b, -0.1*gradients[b])

    @printf("%.9f\n", loss(x, y))
end

0.451586131
0.194820878
0.084048583
0.036259791
0.015643005
0.006748622
0.002911454
0.001256044
0.000541876
0.000233773


# Julia Closures

Julia closures are a little different. You need to know this to understand some of these examples.

In [9]:
function adder(n)
    x -> n + x
end

add6 = adder(6)
println(add6(5))
println(add6.n)  # Note that the variables in the closure are available as attributes of the function.

11
6


In [10]:
struct StructAdder
    n
end

(s::StructAdder)(x) = s.n + x

add6 = StructAdder(6)
println(add6(5))
println(add6.n)

11
6


# Layers

Here we build and train a `5 * 3 * 2` neural network, with a sigmoid function. Again, we're training on a single data point.

In [11]:
function linear(in, out)
    W = param(randn(out, in))
    b = param(randn(out))
    x -> W*x .+ b
end

linear1 = linear(5, 3)
linear2 = linear(3, 2)

model(x) = linear2(σ.(linear1(x)))

model(rand(5))

Tracked 2-element Array{Float64,1}:
  0.3517323725917125 
 -0.23742354870093552

In [12]:
function loss(x, y)
    ŷ = model(x)
    sum((y .- ŷ).^2)
end

x, y = randn(5), randn(2)

loss(x, y)

1.0733103791941518 (tracked)

In [13]:
for epoch in 1:10
    gradients = gradient(() -> loss(x, y), params(linear1.W, linear1.b, linear2.W, linear2.b))
    
    update!(linear1.W, -0.1*gradients[linear1.W])
    update!(linear1.b, -0.1*gradients[linear1.b])
    update!(linear2.W, -0.1*gradients[linear2.W])
    update!(linear2.b, -0.1*gradients[linear2.b])
    
    @printf("%.9f\n", loss(x, y))
end

0.274020578
0.069499191
0.017516294
0.004416267
0.001118774
0.000285316
0.000073272
0.000018939
0.000004923
0.000001286


# `train!`

https://fluxml.ai/Flux.jl/stable/training/training/

In [14]:
x, y = randn(5), randn(2)

loss(x, y)

2.9954732270380444 (tracked)

In [15]:
for epoch in 1:10
    train!(loss, params(linear1.W, linear1.b, linear2.W, linear2.b), [(x, y)], Descent())
    @printf("%.9f\n", loss(x, y))
end 

1.181943397
0.482464323
0.201011953
0.084782366
0.036028257
0.015382719
0.006588213
0.002827619
0.001215429
0.000523031


# `Chain`

In [16]:
model2 = Chain(
    Dense(10, 5, relu),
    Dense(5, 2, relu),
    Dense(2, 2))

Chain(Dense(10, 5, NNlib.relu), Dense(5, 2, NNlib.relu), Dense(2, 2))

In [17]:
x, y = randn(10, 15), randn(2, 15)

loss2(x, y) = Flux.mse(model2(x), y)

loss2 (generic function with 1 method)

In [18]:
@printf("%.9f\n", loss2(x, y))

for epoch in 1:10
    train!(loss2, params(model2), [(x, y)], ADAM())
    @printf("%.9f\n", loss2(x, y))
end

2.813097557
2.785530979
2.758332369
2.731498482
2.705067583
2.679022002
2.653334964
2.628001345
2.603012662
2.578365758
2.554058211
