## Setup

In [1]:
# Activate the project two directories up and make sure every package used by the
# notebook is recorded in it before anything is loaded.
import Pkg
Pkg.activate("../..")  # Switch the active environment to the project at ../..

# Each call resolves the package into the active project (a no-op when already present).
Pkg.add("BenchmarkTools")
Pkg.add("LinearAlgebra")
Pkg.add("Statistics")
Pkg.add("Distributions")
Pkg.add("Random")
Pkg.add("Plots")
Pkg.add("MLDatasets")
Pkg.add("DataFrames")
Pkg.add("MLDataUtils")
Pkg.instantiate()      # Install any dependencies that are still missing
Pkg.status()          # Print the dependency list of the active project


# Now try importing
using BenchmarkTools
using LinearAlgebra
using Distributions
using Random
using MLDatasets
using Plots
using Statistics
using DataFrames

[32m[1m  Activating[22m[39m project at `~/Repos/AWiD/MyMlp`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m      Compat[22m[39m entries added for 
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Manifest.toml`
[92m[1mPrecompiling[22m[39m project...
    998.8 ms[32m  ✓ [39m[90mLearnBase[39m
    954.8 ms[32m  ✓ [39m[90mMLLabelUtils[39m
   1178.2 ms[32m  ✓ [39m[90mMLDataPattern[39m
   3868.0 ms[32m  ✓ [39mDistributions
   1728.8 ms[32m  ✓ [39mMLDataUtils
   1136.4 ms[32m  ✓ [39mDistributions → DistributionsChainRulesCoreExt
   1150.2 ms[32m  ✓ [39mDistributions → DistributionsTestExt
   5027.6 ms[32m  ✓ [39m[90mMLUtils[39m
   6163.7 ms[32m  ✓ [39mMLDatasets
  43323.7 ms[32m  ✓ [39mPlots
   2582.3 ms[32m  ✓ [39mPlots → UnitfulExt
   2625.2 ms[32m  ✓ [39mPlots → FileIOExt
   4541.2 ms[32m  ✓ [39mMyMlp
  13 dependencies successfully precompiled in 54 seconds. 

[36m[1mProject[22m[39m MyMlp v0.1.0
[32m[1mStatus[22m[39m `~/Repos/AWiD/MyMlp/Project.toml`
  [90m[6e4b80f9] [39mBenchmarkTools v1.6.0
  [90m[a93c6f00] [39mDataFrames v1.7.0
  [90m[31c24e10] [39mDistributions v0.25.120
  [90m[cc2ba9b6] [39mMLDataUtils v0.5.4
  [90m[eb30cadb] [39mMLDatasets v0.7.18
  [90m[91a5bcdd] [39mPlots v1.40.13
  [90m[10745b16] [39mStatistics v1.11.1
  [90m[37e2e46d] [39mLinearAlgebra v1.11.0
  [90m[9a3f8284] [39mRandom v1.11.0


## Comparison of different optimizations to Variable

In [1]:
import Pkg
Pkg.add("BenchmarkTools")
using BenchmarkTools

abstract type GraphNode end
abstract type Operator <: GraphNode end

# Original implementation
"""
    VariableOriginal(output; name="?")

Baseline graph variable: `output` and `grad` are declared as `Any`, and `grad` starts
out as `nothing`.
"""
mutable struct VariableOriginal <: GraphNode
    output::Any
    grad::Any
    name::String

    function VariableOriginal(output; name="?")
        return new(output, nothing, name)
    end
end

# Optimized implementation
"""
    VariableOptimized(output::Float64; name="?")

Graph variable with concretely typed fields: `output` is a `T` and `grad` is either
`nothing` (its initial value) or a `T`.
"""
mutable struct VariableOptimized{T<:Float64} <: GraphNode
    output::T
    grad::Union{Nothing,T}
    name::String

    function VariableOptimized(output::T; name="?") where {T<:Float64}
        return new{T}(output, nothing, name)
    end
end

# RefValue-based immutable implementation
"""
    VariableRef(output::Float64; name="?")

Immutable graph variable that keeps its value inside a `Base.RefValue{Float64}`, so the
value can still be changed through `output[]`. `grad` is set to `nothing` by the
constructor; since the struct is immutable the field itself cannot be reassigned.
"""
struct VariableRef <: GraphNode
    output::Base.RefValue{Float64}
    grad::Union{Nothing,Base.RefValue{Float64}}
    name::String

    VariableRef(output::Float64; name="?") = new(Ref(output), nothing, name)
end

# Functions to benchmark: reading
# Read benchmark kernels: fetch the stored value and add 1.0 without storing the result.
read_output(v::VariableOriginal) = v.output + 1.0

read_output(v::VariableOptimized) = v.output + 1.0

# The value lives behind a reference, hence the `[]` dereference.
read_output(v::VariableRef) = v.output[] + 1.0

# Functions to benchmark: writing
# Write benchmark kernels: increment the stored value by 1.0 in place and return the
# new value.
write_output!(v::VariableOriginal) = (v.output += 1.0)

write_output!(v::VariableOptimized) = (v.output += 1.0)

# The increment goes through the reference; the struct itself is never modified.
write_output!(v::VariableRef) = (v.output[] += 1.0)

# Create one instance of every variant, all starting from the same value.
v1 = VariableOriginal(1.0)
v2 = VariableOptimized(1.0)
v3 = VariableRef(1.0)

# Benchmark READ: the instances are interpolated with `$` so that @btime times the
# call itself rather than the lookup of a global variable.
println("### Benchmark: READ access ###")
println("Original:")
@btime read_output($v1)

println("Optimized:")
@btime read_output($v2)

println("RefValue:")
@btime read_output($v3)

# Benchmark WRITE: every evaluation increments the instance, so the stored values
# keep growing while the benchmark runs.
println("\n### Benchmark: WRITE mutation ###")
println("Original:")
@btime write_output!($v1)

println("Optimized:")
@btime write_output!($v2)

println("RefValue:")
@btime write_output!($v3)



[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`


### Benchmark: READ access ###
Original:
  14.034 ns (1 allocation: 16 bytes)
Optimized:
  1.575 ns (0 allocations: 0 bytes)
RefValue:
  1.793 ns (0 allocations: 0 bytes)

### Benchmark: WRITE mutation ###
Original:
  14.417 ns (1 allocation: 16 bytes)
Optimized:
  2.013 ns (0 allocations: 0 bytes)
RefValue:
  2.013 ns (0 allocations: 0 bytes)


500503.0

## Next

In [11]:
import Base: *, +, clamp, log, exp
import LinearAlgebra: mul!
import Statistics: sum

abstract type GraphNode end
abstract type Operator <: GraphNode end

# Definition of basic structures for computational graph
"""
    Constant(output)

Graph leaf holding a `Matrix{Float64}` value. It has no gradient field.
"""
mutable struct Constant{T<:Matrix{Float64}} <: GraphNode
    output::T
end

"""
    Variable(output; name="?")

Graph leaf holding a `Matrix{Float64}` value together with a `gradient` slot of the
same type. The gradient is `nothing` until one is assigned.
"""
mutable struct Variable{T<:Matrix{Float64}} <: GraphNode
    output::T
    gradient::Union{Nothing,T}
    name::String

    function Variable(output::T; name="?") where {T<:Matrix{Float64}}
        return new{T}(output, nothing, name)
    end
end

"""
    ScalarOperator(fun, inputs...; name="?")

Operator node with two graph-node inputs and a `Float64` result. The type parameter
records `typeof(fun)`; `output` and `gradient` start as `nothing`.
"""
mutable struct ScalarOperator{F} <: Operator
    inputs::Union{Nothing,Tuple{GraphNode,GraphNode}}
    output::Union{Nothing,Float64}
    gradient::Union{Nothing,Float64}
    name::String

    function ScalarOperator(fun, inputs...; name="?")
        return new{typeof(fun)}(inputs, nothing, nothing, name)
    end
end

"""
    BroadcastedOperator(fun, inputs...; name="?")

Operator node with one or two graph-node inputs and a `Matrix{Float64}` result. The
type parameter records `typeof(fun)`; `output` and `gradient` start as `nothing`.
"""
mutable struct BroadcastedOperator{F} <: Operator
    inputs::Union{Nothing,Tuple{GraphNode,GraphNode},Tuple{GraphNode}}
    output::Union{Nothing,Matrix{Float64}}
    gradient::Union{Nothing,Matrix{Float64}}
    name::String

    function BroadcastedOperator(fun, inputs...; name="?")
        return new{typeof(fun)}(inputs, nothing, nothing, name)
    end
end


import Base: show, summary
# Compact text representations of the graph nodes.
function show(io::IO, x::ScalarOperator{F}) where {F}
    return print(io, "op ", x.name, "(", F, ")")
end

function show(io::IO, x::BroadcastedOperator{F}) where {F}
    return print(io, "op.", x.name, "(", F, ")")
end

function show(io::IO, x::Constant)
    return print(io, "const ", x.output)
end

# A variable prints its name followed by summaries of its value and its gradient.
function show(io::IO, x::Variable)
    print(io, "var ", x.name)
    print(io, "\n ┣━ ^ ")
    summary(io, x.output)
    print(io, "\n ┗━ ∇ ")
    return summary(io, x.gradient)
end


# Record a node without inputs: mark it visited and append it to `order`, unless it
# has been seen before.
function visit(node::GraphNode, visited, order)
    node ∈ visited && return nothing
    push!(visited, node)
    push!(order, node)
    return nothing
end

# Depth-first visit of an operator: all inputs are appended to `order` before the
# operator itself, so every node comes after the nodes it depends on.
function visit(node::Operator, visited, order)
    node ∈ visited && return nothing
    push!(visited, node)
    foreach(input -> visit(input, visited, order), node.inputs)
    push!(order, node)
    return nothing
end

"""
    topological_sort(head::GraphNode)

Return a `Vector{Any}` with every node reachable from `head`, ordered so that each
node appears after all of its inputs; `head` is the last element.
"""
function topological_sort(head::GraphNode)
    order = Any[]
    visit(head, Set{Any}(), order)
    return order
end


# x * y (aka matrix multiplication)
# Graph-level `*` creates a matrix-product node tagged with `mul!`.
function *(A::GraphNode, x::GraphNode)
    return BroadcastedOperator(mul!, A, x)
end

# Forward pass: ordinary matrix product of the two input values.
function forward(::BroadcastedOperator{typeof(mul!)}, A, x)
    return A * x
end

# Backward pass: gradient w.r.t. A is g * x', gradient w.r.t. x is A' * g.
function backward(::BroadcastedOperator{typeof(mul!)}, A, x, g)
    return (g * adjoint(x), adjoint(A) * g)
end

# relu activation
# Graph-level relu creates a single-input node tagged with `relu` itself.
function relu(x::GraphNode)
    return BroadcastedOperator(relu, x)
end

# Forward pass: entries that are not strictly positive are multiplied by `false`.
function forward(::BroadcastedOperator{typeof(relu)}, x)
    positive = x .> 0.0
    return x .* positive
end

# Backward pass: the gradient flows only through strictly positive inputs. The second
# tuple element is a `nothing` placeholder.
function backward(::BroadcastedOperator{typeof(relu)}, x, g)
    positive = x .> 0.0
    return (g .* positive, nothing)
end

# add operation (for bias)
# Graph-level `+` creates a broadcasting addition node (used for adding a bias).
+(x::GraphNode, y::GraphNode) = BroadcastedOperator(+, x, y)

# Forward pass: broadcasting addition, so a smaller operand (e.g. an n×1 bias) is
# expanded to the shape of the other one.
forward(::BroadcastedOperator{typeof(+)}, x, y) = x .+ y

# Undo broadcasting for one operand: sum the gradient `g` over every dimension in which
# `target` has size 1 but `g` does not, then give the result the shape of `target`.
# When the shapes already agree `g` is returned unchanged.
function _reduce_to_shape(g, target)
    size(g) == size(target) && return g
    dims = Tuple(d for d in 1:ndims(g) if size(target, d) == 1 && size(g, d) != 1)
    reduced = isempty(dims) ? g : sum(g; dims=dims)
    return reshape(reduced, size(target))
end

# Backward pass: each operand receives the upstream gradient reduced to its own shape.
# For a column bias added to an n×batch matrix this is the sum over the batch dimension
# (dims=2), which is what the previous implementation hard-coded for `y` only.
function backward(::BroadcastedOperator{typeof(+)}, x, y, g)
    return (_reduce_to_shape(g, x), _reduce_to_shape(g, y))
end

# sigmoid activation
# Graph-level sigmoid creates a single-input node tagged with `σ` itself.
function σ(x::GraphNode)
    return BroadcastedOperator(σ, x)
end

# Forward pass: element-wise logistic function 1 / (1 + exp(-x)).
function forward(::BroadcastedOperator{typeof(σ)}, x)
    return 1.0 ./ (1.0 .+ exp.(-x))
end

# Backward pass: reuses the value stored in `node.output` by the forward pass, since
# the derivative of the sigmoid is s * (1 - s). The second tuple element is a
# `nothing` placeholder.
function backward(node::BroadcastedOperator{typeof(σ)}, x, g)
    s = node.output
    return (g .* (s .* (1.0 .- s)), nothing)
end

# # binarycrossentropy
# binarycrossentropy(y::GraphNode, ŷ::GraphNode) = ScalarOperator(binarycrossentropy, y, ŷ)
# forward(::ScalarOperator{typeof(binarycrossentropy)}, ŷ, y) = begin
#     return -mean(y .* log.(ŷ) + (1.0 .- y) .* log.(1.0 .- ŷ))
# end
# backward(::ScalarOperator{typeof(binarycrossentropy)}, ŷ, y, g) = begin
#     J = (ŷ .- y) ./ (ŷ .* (1.0 .- ŷ))
#     return (J * g, nothing)
# end

"""
    binary_cross_entropy_loss_impl(ŷ, y_true; epsilon=1e-10)

Return the mean binary cross-entropy between the predictions `ŷ` and the targets
`y_true`. Predictions are clamped to `[epsilon, 1 - epsilon]` first so that neither
logarithm receives zero.
"""
function binary_cross_entropy_loss_impl(ŷ, y_true; epsilon=1e-10)
    p = clamp.(ŷ, epsilon, 1.0 - epsilon)
    per_element = @. -y_true * log(p) - (1.0 - y_true) * log(1.0 - p)
    return mean(per_element)
end

# Graph-level loss creates a scalar node tagged with the loss implementation; the
# prediction node comes first, the target node second.
function binarycrossentropy(ŷ::GraphNode, y::GraphNode)
    return ScalarOperator(binary_cross_entropy_loss_impl, ŷ, y)
end

# Forward pass: evaluate the loss implementation on the two input values.
function forward(::ScalarOperator{typeof(binary_cross_entropy_loss_impl)}, ŷ_value, y_value)
    return binary_cross_entropy_loss_impl(ŷ_value, y_value)
end

# Backward pass of the mean binary cross-entropy.
#
# Returns `(gradient w.r.t. ŷ, nothing)`; the targets receive no gradient.
# The per-element derivative (p - y) / (p * (1 - p)) is scaled by the upstream
# gradient `g` and divided by the number of elements, which is the normaliser used by
# `mean` in the forward pass (for 1×batch targets this equals the batch size).
function backward(::ScalarOperator{typeof(binary_cross_entropy_loss_impl)}, ŷ_value, y_value, g)
    # Same clamping constant as the default of the forward implementation.
    epsilon = 1e-10
    p = clamp.(ŷ_value, epsilon, 1.0 - epsilon)
    element_count = length(y_value)
    grad_wrt_ŷ = g .* ((p .- y_value) ./ (p .* (1.0 .- p))) ./ element_count
    return (grad_wrt_ŷ, nothing)
end


backward (generic function with 5 methods)

In [12]:
# Clear the stored gradient of a node. Constants hold no gradient, so nothing happens.
reset!(::Constant) = nothing

function reset!(node::Variable)
    node.gradient = nothing
    return nothing
end

function reset!(node::Operator)
    node.gradient = nothing
    return nothing
end

# Leaves already hold their value, so there is nothing to compute.
compute!(::Constant) = nothing
compute!(::Variable) = nothing

# Evaluate an operator from the current values of its inputs and store the result.
function compute!(node::Operator)
    input_values = map(input -> input.output, node.inputs)
    node.output = forward(node, input_values...)
end

"""
    forward!(order::Vector)

Run the forward pass over `order`: every node is computed and then has its gradient
cleared. Returns the `output` of the last node.
"""
function forward!(order::Vector)
    # Walk through the nodes in the given order.
    foreach(order) do node
        compute!(node)
        reset!(node)
    end
    return last(order).output
end

forward! (generic function with 1 method)

In [13]:
# Constants do not accumulate gradients.
update!(node::Constant, gradient) = nothing

"""
    update!(node::GraphNode, gradient)

Accumulate `gradient` into `node.gradient` and return the stored gradient.

- A `nothing` gradient (the placeholder some `backward` methods return) is ignored.
- The first array gradient is copied, so later in-place accumulation never modifies an
  array that is still owned by the node which produced it.
- Array gradients are accumulated in place; scalar gradients are accumulated by
  reassignment, because a `Float64` cannot be broadcast into.
"""
function update!(node::GraphNode, gradient)
    isnothing(gradient) && return node.gradient
    current = node.gradient
    if isnothing(current)
        node.gradient = gradient isa AbstractArray ? copy(gradient) : gradient
    elseif current isa AbstractArray
        current .+= gradient
    else
        node.gradient = current + gradient
    end
    return node.gradient
end


"""
    backward!(order::Vector; seed=1.0)

Run the backward pass over a topologically sorted `order`.

The last node is treated as the output node and its gradient is initialised from
`seed`: a matrix output gets a matrix of the same size filled with `seed`, any other
output must have exactly one element and gets `seed` itself. The nodes are then
processed in reverse order. Returns `nothing`.

# Throws
- `ArgumentError` if `order` is empty, if the output node has not been computed yet,
  or if a non-matrix output has more than one element.
"""
function backward!(order::Vector; seed=1.0)
    isempty(order) && throw(ArgumentError("cannot backpropagate through an empty order"))
    result = last(order)   #   The output node
    result.gradient = _seed_gradient(result.output, seed)

    for node in Iterators.reverse(order)   #   Reverse topological order
        backward!(node)
    end
    return nothing
end

# Initial gradient for a matrix-valued output: one `seed` per element.
_seed_gradient(output::Matrix{Float64}, seed) = fill(convert(Float64, seed), size(output))

# Initial gradient for any other output; validated before anything is stored.
function _seed_gradient(output, seed)
    isnothing(output) &&
        throw(ArgumentError("the output node has no value; run forward! first"))
    length(output) == 1 ||
        throw(ArgumentError("Gradient is defined only for scalar functions"))
    return seed
end

# Leaves have no inputs to propagate to.
backward!(::Constant) = nothing
backward!(::Variable) = nothing

# Compute the gradients of an operator w.r.t. its inputs and hand each one to the
# corresponding input node. `zip` stops at the shorter collection, so surplus entries
# returned by `backward` are dropped.
function backward!(node::Operator)
    sources = node.inputs
    source_values = map(source -> source.output, sources)
    source_gradients = backward(node, source_values..., node.gradient)

    for (source, source_gradient) in zip(sources, source_gradients)
        update!(source, source_gradient)
    end
    return nothing
end

backward! (generic function with 4 methods)

## Funkcja Xavier

In [14]:
"""
    xavier_uniform(size::Tuple{Int, Int})

Return a matrix of the given size sampled from `Uniform(-a, a)` with
`a = sqrt(6 / (size[1] + size[2]))`.
"""
function xavier_uniform(size::Tuple{Int, Int})
    bound = sqrt(6.0 / sum(size))
    distribution = Uniform(-bound, bound)
    return rand(distribution, size)
end

"""
    xavier_normal(size::Tuple{Int, Int})

Return a matrix of the given size sampled from `Normal(0, s)` with
`s = sqrt(2 / (size[1] + size[2]))`.
"""
function xavier_normal(size::Tuple{Int, Int})
    spread = sqrt(2.0 / sum(size))
    distribution = Normal(0.0, spread)
    return rand(distribution, size)
end

"""
    xavier_uniform!(w::Matrix{Float64})

Overwrite `w` with samples from `Uniform(-a, a)`, where `a = sqrt(6 / (rows + cols))`
of `w`, and return the result of `rand!`.
"""
function xavier_uniform!(w::Matrix{Float64})
    bound = sqrt(6.0 / sum(size(w)))
    return rand!(Uniform(-bound, bound), w)
end

"""
    xavier_normal!(w::Matrix{Float64})

Overwrite `w` with samples from `Normal(0, s)`, where `s = sqrt(2 / (rows + cols))`
of `w`, and return the result of `rand!`.
"""
function xavier_normal!(w::Matrix{Float64})
    spread = sqrt(2.0 / sum(size(w)))
    return rand!(Normal(0.0, spread), w)
end

# Smoke test of the initialisers: allocating variants first, in-place variants second.
xavier_uniform((3, 2))  # Example usage (result is discarded)
w1 = Variable(xavier_normal((3, 2)); name="w1")

# The in-place variants write straight into the matrix owned by the variable; the
# second call overwrites the samples drawn by the first.
w2_mat = zeros(Float64, (3, 2))
w2 = Variable(w2_mat; name="w2")
xavier_uniform!(w2.output)
xavier_normal!(w2.output)
w2.output  # Check the result

3×2 Matrix{Float64}:
  0.134939   0.115107
 -0.610532  -0.211263
 -0.423269  -0.988555

##  Funkcje dostępowe do wag, biasów oraz ich gradientów

In [16]:
"""
    get_weights(order::Vector)

Return `(name, variable)` tuples for every `Variable` in `order` whose name contains
the substring `"w"`, in the order in which they appear.
"""
function get_weights(order::Vector)
    return Tuple{String, Variable}[
        (node.name, node) for node in order if node isa Variable && occursin("w", node.name)
    ]
end

"""
    get_biases(order::Vector)

Return `(name, variable)` tuples for every `Variable` in `order` whose name contains
the substring `"b"`, in the order in which they appear.
"""
function get_biases(order::Vector)
    return Tuple{String, Variable}[
        (node.name, node) for node in order if node isa Variable && occursin("b", node.name)
    ]
end

"""
    get_weights_and_biases(order::Vector)

Return `(name, variable)` tuples for every `Variable` in `order` whose name contains
`"w"` or `"b"`, in the order in which they appear.
"""
function get_weights_and_biases(order::Vector)
    parameters = Vector{Tuple{String, Variable}}()
    for node in order
        node isa Variable || continue
        if any(tag -> occursin(tag, node.name), ("w", "b"))
            push!(parameters, (node.name, node))
        end
    end
    return parameters
end

"""
    get_gradients(order::Vector)

Return `(name, variable)` tuples for every `Variable` in `order` whose name contains
`"w"` or `"b"`. The tuples hold the variables themselves, not gradient arrays; read
the gradient from each variable's `gradient` field.
"""
function get_gradients(order::Vector)
    gradients = Vector{Tuple{String, Variable}}()
    for node in order
        node isa Variable || continue
        is_parameter = occursin("w", node.name) || occursin("b", node.name)
        is_parameter && push!(gradients, (node.name, node))
    end
    return gradients
end

get_gradients (generic function with 1 method)

## Optymalizator ADAM

In [None]:
"""
    Adam

Hyper-parameters and running state of an Adam optimizer. The moment estimates are
stored per parameter, keyed by the parameter's name.
"""
mutable struct Adam
    # Hyper-parameters
    α::Float64    # learning rate
    β1::Float64   # decay rate of the first moment
    β2::Float64   # decay rate of the second moment
    ε::Float64    # epsilon for numerical stability

    # Optimizer state
    m::Dict{String, Matrix{Float64}}  # first moment estimates
    v::Dict{String, Matrix{Float64}}  # second moment estimates
    t::Int                            # time step

    # Optimized parameters
    parameters::Vector{Tuple{String, Variable}}   # all parameters (weights and biases)
end

"""
    init!(order::Vector, α=0.001, β1=0.9, β2=0.999, ε=1e-8)

Create an `Adam` optimizer for the parameters found in `order`.

The parameters are collected with `get_weights_and_biases(order)`. For every parameter
a zero-filled first and second moment matrix of the parameter's size is created and
stored under the parameter's name, so parameters sharing a name share one entry.

# Arguments
- `order`: vector of graph nodes (e.g. the result of `topological_sort`)
- `α`: learning rate (positional, default 0.001)
- `β1`: decay rate of the first moment (positional, default 0.9)
- `β2`: decay rate of the second moment (positional, default 0.999)
- `ε`: epsilon for numerical stability (positional, default 1e-8)

# Returns
- the initialised `Adam` optimizer with time step 0
"""
function init!(order::Vector, α=0.001, β1=0.9, β2=0.999, ε=1e-8)
    parameters = get_weights_and_biases(order)

    m = Dict{String, Matrix{Float64}}()
    v = Dict{String, Matrix{Float64}}()
    for (name, var) in parameters
        m[name] = zeros(Float64, size(var.output))
        v[name] = zeros(Float64, size(var.output))
    end
    return Adam(α, β1, β2, ε, m, v, 0, parameters)
end

"""
    step!(optimizer::Adam)

Perform one Adam update on every parameter of `optimizer`.

The time step is incremented first. For each parameter the moment estimates are
updated in place from the parameter's current gradient, bias-corrected, and used to
update the parameter's `output` in place. Parameters whose gradient is `nothing`
(no gradient reached them in the last backward pass) are skipped.

# Arguments
- `optimizer`: an initialised `Adam` optimizer

# Returns
- `nothing` (the parameters and the optimizer state are modified in place)
"""
function step!(optimizer::Adam)
    optimizer.t += 1

    α, ε = optimizer.α, optimizer.ε
    β1, β2 = optimizer.β1, optimizer.β2
    # The bias-correction denominators depend only on the time step.
    correction1 = 1 - β1^optimizer.t
    correction2 = 1 - β2^optimizer.t

    for (name, var) in optimizer.parameters
        g = var.gradient
        isnothing(g) && continue

        #   Update the moment estimates
        m = optimizer.m[name]
        v = optimizer.v[name]
        m .= β1 .* m .+ (1 - β1) .* g
        v .= β2 .* v .+ (1 - β2) .* (g .^ 2)

        #   Apply the bias-corrected update to the parameter
        var.output .-= α .* (m ./ correction1) ./ (sqrt.(v ./ correction2) .+ ε)
    end
    return nothing
end

"""
    reset!(optimizer::Adam)

Reset the optimizer state: the time step becomes 0 and every stored first and second
moment matrix is zeroed in place.

# Arguments
- `optimizer`: the `Adam` optimizer to reset

# Returns
- `nothing` (the optimizer is modified in place)
"""
function reset!(optimizer::Adam)
    optimizer.t = 0
    #  Zero the moments without allocating temporary matrices
    foreach(moment -> fill!(moment, 0.0), values(optimizer.m))
    foreach(moment -> fill!(moment, 0.0), values(optimizer.v))
    return nothing
end

"""
    get_state(optimizer::Adam)

Return a snapshot of the optimizer state (useful for saving checkpoints).

# Arguments
- `optimizer`: an `Adam` optimizer

# Returns
- a `Dict` with the time step under `"t"` and deep copies of the moment dictionaries
  under `"m"` and `"v"`
"""
function get_state(optimizer::Adam)
    first_moments = deepcopy(optimizer.m)
    second_moments = deepcopy(optimizer.v)
    return Dict{String, Any}(
        "t" => optimizer.t,
        "m" => first_moments,
        "v" => second_moments,
    )
end

"""
    set_state!(optimizer::Adam, state::Dict)

Restore the optimizer state from a saved snapshot.

The moment dictionaries are deep-copied, so the optimizer never shares storage with
`state` and later optimization steps leave the snapshot untouched.

# Arguments
- `optimizer`: an `Adam` optimizer
- `state`: a snapshot with the keys `"t"`, `"m"` and `"v"` (as produced by `get_state`)

# Returns
- `nothing` (the optimizer is modified in place)
"""
function set_state!(optimizer::Adam, state::Dict)
    optimizer.t = state["t"]
    optimizer.m = deepcopy(state["m"])
    optimizer.v = deepcopy(state["v"])
    return nothing
end

set_state!

## Test wejścia do neuronu

In [7]:
# 3×2 input matrix; `reshape` fills column by column, so both columns are [1, 2, 3].
x = Variable(reshape([1.0, 2.0, 3.0, 1.0, 2.0, 3.0], 3, 2), name="x")
x.output

3×2 Matrix{Float64}:
 1.0  1.0
 2.0  2.0
 3.0  3.0

In [8]:
# 1×3 weight matrix (a single row).
w = Variable([1.0 2.0 3.0], name="w")
w.output

1×3 Matrix{Float64}:
 1.0  2.0  3.0

In [9]:
# Build the matrix-product node; nothing is computed until the forward pass runs.
z = w * x
z.name = "z"

"z"

In [10]:
# Sort the graph that ends in z and collect its weight variables.
order = topological_sort(z)
println("Topological order:")
# Only the last expression of a cell is displayed, so this bare `order` shows nothing.
order
weights = get_weights(order)


Topological order:


1-element Vector{Tuple{String, Matrix{Float64}}}:
 ("w", [1.0 2.0 3.0])

In [20]:
# In-place update of the weight matrix; `weights` holds the same Variable object, so
# the change is visible through it as well.
w.output .+= 1.0
weights

1-element Vector{Tuple{String, Matrix{Float64}}}:
 ("w", [11.0 12.0 13.0])

In [13]:
# Forward pass; `y` receives the output of the last node in `order`.
y = forward!(order)
z.output

1×2 Matrix{Float64}:
 14.0  14.0

In [10]:
# Backward pass, then inspect the gradient accumulated in the weights.
backward!(order)
w.gradient

1×3 Matrix{Float64}:
 2.0  4.0  6.0

## Test 2 szeregowych Neuronów - 1 warstwa + bias


In [5]:
# Single layer test: 3×2 input, 2×3 weights, relu activation and a cross-entropy loss
# against a 1×2 target.
x = Variable([1.0 1.0; 2.0 2.0; 3.0 3.0], name="x")
w = Variable([2.0 4.0 6.0; 3.0 5.0 7.0], name="w")
y = Constant(reshape([1.0, 1.0], 1, 2))
z = w * x
z.name = "z"
# c = Constant(1.0)
# d = z + c
# dense_layer_2 = σ(z)
# dense_layer_2.name = "σ(z)"
dense_layer_2 = relu(z)
dense_layer_2.name = "relu(z)"
# NOTE(review): the relu output is 2×2 while the target is 1×2, and relu values are not
# limited to (0, 1); the loss clamps and broadcasts them — confirm this is intended.
loss = binarycrossentropy(dense_layer_2, y)
loss.name = "binarycrossentropy"
order = topological_sort(loss)
# `y` is rebound here from the target Constant to the computed loss value.
y = forward!(order)
backward!(order)

## Test 2. warstw neuronów 2-4

In [None]:
#   First layer: 3×1 input, 2×3 weights, relu
x = Variable(reshape([1.0, 2.0, 3.0], 3, 1), name="x")
w = Variable(reshape([2.0, 3.0, 4.0, 5.0, 6.0, 7.0], 2, 3), name="w1")
a = w * x
a.name = "a"
b = relu(a)
b.name = "b"

#   Second layer: 4×2 weights applied to the output of the first layer, relu
w2 = Variable(reshape([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 4,2), name="w2")
c = w2 * b
c.name = "c"
d = relu(c)
d.name = "d"
order = topological_sort(d)

7-element Vector{Any}:
 var w2
 ┣━ ^ 4×2 Matrix{Float64}
 ┗━ ∇ Nothing
 var w1
 ┣━ ^ 2×3 Matrix{Float64}
 ┗━ ∇ Nothing
 var x
 ┣━ ^ 3×1 Matrix{Float64}
 ┗━ ∇ Nothing
 op.a(typeof(mul!))
 op.b(typeof(relu))
 op.c(typeof(mul!))
 op.d(typeof(relu))

In [12]:
# The output node is matrix-valued, so backward! seeds its gradient with a matrix
# instead of a scalar.
ŷ = forward!(order)
backward!(order)

## Test binary cross entropy

In [5]:
# Loss of a single prediction 0.8 against the target 1.0; both are 1×1 matrices.
ŷ = Variable(reshape([0.8], 1, 1), name="ŷ")
y = Variable(reshape([1.0], 1, 1), name="y")
loss = binarycrossentropy(ŷ, y)
loss.name = "loss"
order = topological_sort(loss)
result = forward!(order)


0.2231435513142097

In [6]:
# Propagate the loss gradient back to ŷ.
backward!(order)

##  Test tworzenia modelu dla batch = 2 relu-sigmoid-bce

In [7]:
# Input: 3×2 constant matrix.
x = Constant([1.0 1.0; 2.0 1.0; 3.0 1.0])
# First layer: 2×3 weights, 2×1 bias, relu.
w1 = Variable([0.1 0.2 0.3; 0.4 0.5 0.6], name="w1")
z1_mul = w1 * x
z1_mul.name = "z1_mul"


b1_matrix = zeros(Float64, 2, 1)
b1_matrix[1,1] = 0.1
b1_matrix[2,1] = 0.2
b1 = Variable(b1_matrix, name="b1")
z1 = z1_mul + b1
z1.name = "z1"

a1 = relu(z1)
a1.name = "a1"

# Second layer: 1×2 weights, 1×1 bias, sigmoid.
w2_matrix = zeros(Float64, 1, 2)
w2_matrix[1,1] = 0.5
w2_matrix[1,2] = -0.5
w2 = Variable(w2_matrix, name="w2")
z2_mul = w2 * a1
z2_mul.name = "z2_mul"

b2_matrix = zeros(Float64, 1, 1)
b2_matrix[1,1] = 0.0
b2 = Variable(b2_matrix, name="b2")
# z2 is never given a name, so it keeps the default "?".
z2 = z2_mul + b2

ŷ = σ(z2)
ŷ.name = "ŷ"

# Targets: a 1×2 constant matrix.
y_matrix = zeros(Float64, 1, 2)
y_matrix[1,1] = 1.0
y_matrix[1,2] = 0.0
y = Constant(y_matrix)

loss = binarycrossentropy(ŷ, y)
loss.name = "loss"

"loss"

In [8]:
# Evaluation order of the whole model; the loss node comes last.
order = topological_sort(loss)

13-element Vector{Any}:
 var w2
 ┣━ ^ 1×2 Matrix{Float64}
 ┗━ ∇ Nothing
 var w1
 ┣━ ^ 2×3 Matrix{Float64}
 ┗━ ∇ Nothing
 const [1.0 1.0; 2.0 1.0; 3.0 1.0]
 op.z1_mul(typeof(mul!))
 var b1
 ┣━ ^ 2×1 Matrix{Float64}
 ┗━ ∇ Nothing
 op.z1(typeof(+))
 op.a1(typeof(relu))
 op.z2_mul(typeof(mul!))
 var b2
 ┣━ ^ 1×1 Matrix{Float64}
 ┗━ ∇ Nothing
 op.?(typeof(+))
 op.ŷ(typeof(σ))
 const [1.0 0.0]
 op loss(typeof(binary_cross_entropy_loss_impl))

In [9]:
# Forward pass; the result is the value of the loss node.
result = forward!(order)

0.8755166955155294

In [13]:
# Inspect the second-layer weights.
w2.output

1×2 Matrix{Float64}:
 0.5  -0.5

In [10]:
# Backward pass from the loss node.
backward!(order)

In [15]:
# All three accessors are called, but only the result of the last one is displayed.
get_weights(order)
get_biases(order)
get_gradients(order)

4-element Vector{Tuple{String, Variable}}:
 ("w2", var w2
 ┣━ ^ 1×2 Matrix{Float64}
 ┗━ ∇ 1×2 Matrix{Float64})
 ("w1", var w1
 ┣━ ^ 2×3 Matrix{Float64}
 ┗━ ∇ 2×3 Matrix{Float64})
 ("b1", var b1
 ┣━ ^ 2×1 Matrix{Float64}
 ┗━ ∇ 2×1 Matrix{Float64})
 ("b2", var b2
 ┣━ ^ 1×1 Matrix{Float64}
 ┗━ ∇ 1×1 Matrix{Float64})

## Iris Test

In [None]:
using MLDatasets
using Random
using LinearAlgebra
using Plots
using DataFrames # Dodajmy pakiet DataFrame, bo wygląda na to, że jest używany
using MLDataUtils

# Load the Iris data set
iris_features, iris_targets = Iris(as_df=false)[:]
# NOTE(review): class1_name and class2_name are not used anywhere in this cell.
class1_name = "Iris-setosa"
class2_name = "Iris-versicolor"
# Keep the first 100 observations (columns) — presumably exactly the setosa and
# versicolor samples; verify against the ordering of the data set.
iris_features_cut = iris_features[:, 1:100];
iris_targets_cut = iris_targets[:, 1:100];

# Encode the two class names as 0.0 / 1.0; any other label raises a KeyError.
label_mapping = Dict("Iris-setosa" => 0.0, "Iris-versicolor" => 1.0)
iris_targets_cut_classes = [label_mapping[class_name] for class_name in iris_targets_cut]
# Shuffle features and labels together.
iris_shuffled_x, iris_shuffled_y = shuffleobs((iris_features_cut, iris_targets_cut_classes));
batch_size = 10
num_observations = size(iris_shuffled_x, 2)

batches = []

# Cut the shuffled data into consecutive batches of at most `batch_size` observations;
# each batch is a (features, 1×n targets) tuple.
for i = 1:batch_size:num_observations
    end_index = min(i+batch_size-1, num_observations)
    features_batch = iris_shuffled_x[:, i:end_index]
    targets_batch = iris_shuffled_y[i:end_index]
    targets_batch = reshape(targets_batch, 1, length(targets_batch))
    push!(batches, (features_batch, targets_batch))
end

# The first 8 batches are used for training, the remaining ones for testing.
iris_train = batches[1:8]
iris_test = batches[9:end]

(features = [5.1 4.9 … 6.2 5.9; 3.5 3.0 … 3.4 3.0; 1.4 1.4 … 5.4 5.1; 0.2 0.2 … 2.3 1.8], targets = InlineStrings.String15["Iris-setosa" "Iris-setosa" … "Iris-virginica" "Iris-virginica"])

In [None]:
#   Neural network settings
features = 4
hidden = 8
out = 1
epochs = 10
batch_size = 10

# Graph: x -> w1 * x -> relu -> w2 * (.) -> σ -> binary cross-entropy against y.
# x and y start as zero-filled placeholders.
x = Constant(zeros(Float64, features, batch_size))
w1 = Variable(xavier_uniform((hidden, features)); name="w1")
z1_mul = w1 * x
z1_mul.name = "z1_mul"
d1 = relu(z1_mul)
d1.name = "d1"
w2 = Variable(xavier_uniform((out, hidden)); name="w2")
z2_mul = w2 * d1
z2_mul.name = "z2_mul"
ŷ = σ(z2_mul)
ŷ.name = "ŷ"
y = Constant(zeros(Float64, out, batch_size))
loss = binarycrossentropy(ŷ, y)
loss.name = "loss"
order = topological_sort(loss)


9-element Vector{Any}:
 var w2
 ┣━ ^ 1×8 Matrix{Float64}
 ┗━ ∇ Nothing
 var w1
 ┣━ ^ 8×4 Matrix{Float64}
 ┗━ ∇ Nothing
 const [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 op.z1_mul(typeof(mul!))
 op.d1(typeof(relu))
 op.z2_mul(typeof(mul!))
 op.ŷ(typeof(σ))
 const [0.0 0.0 … 0.0 0.0]
 op loss(typeof(binary_cross_entropy_loss_impl))

In [None]:
#   Create the Adam optimizer with its default hyper-parameters and print its state
optimizer = init!(order)
println("Initial state of optimizer:")
println(get_state(optimizer))

Initial state of optimizer:
Dict{String, Any}("v" => Dict("w2" => [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0], "w1" => [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0]), "m" => Dict("w2" => [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0], "w1" => [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0]), "t" => 0)


In [26]:
#   Run training and report the loss of the last batch after every epoch
for epoch in 1:epochs
    println("Epoch $epoch")
    # Declared before the inner loop: a variable first assigned inside the inner `for`
    # would be local to it and undefined at the `println` below.
    loss_value = NaN
    for (features_batch, targets_batch) in iris_train
        # Feed the current batch into the placeholder nodes of the graph.
        x.output = features_batch
        y.output = targets_batch
        forward!(order)
        loss_value = loss.output
        backward!(order)
        step!(optimizer)
    end
    println("Loss after epoch $epoch: ", loss_value)
end

Epoch 1


ErrorException: setfield!: immutable struct of type Adam cannot be changed