## Setup

In [10]:
# Make sure we're using the latest version
import Pkg
Pkg.activate("../..")  # Activate the project environment

Pkg.add("BenchmarkTools")
Pkg.add("LinearAlgebra")
Pkg.add("Statistics")
Pkg.add("Distributions")
Pkg.add("Random")
Pkg.add("Plots")
Pkg.add("MLDatasets")
Pkg.add("DataFrames")
Pkg.add("MLDataUtils")
Pkg.add("JLD2")
Pkg.add("UUIDs")
Pkg.instantiate()      # Install any missing dependencies
Pkg.status()          # Check if MyMlp is listed


[32m[1m  Activating[22m[39m project at `~/Repos/AWiD/MyMlp`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m      Compat[22m[39m entries added for 
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Manifest.toml`
[92m[1mPrecompiling[22m[39m project...
   6144.9 ms[32m  ✓ [39mMyMlp
  1 dependency successfully precompiled in 8 seconds. 337 already precompiled.
[32m[1m   Resolving[22m[39m package versions...
[32m[1m      Compat[22m[39m entries added for 
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m      Compat[22m[39m entries added for 
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Repos/AWiD/MyMlp/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32

[36m[1mProject[22m[39m MyMlp v0.1.0
[32m[1mStatus[22m[39m `~/Repos/AWiD/MyMlp/Project.toml`
  [90m[6e4b80f9] [39mBenchmarkTools v1.6.0
  [90m[a93c6f00] [39mDataFrames v1.7.0
  [90m[31c24e10] [39mDistributions v0.25.120
  [90m[033835bb] [39mJLD2 v0.5.13
  [90m[cc2ba9b6] [39mMLDataUtils v0.5.4
  [90m[eb30cadb] [39mMLDatasets v0.7.18
  [90m[91a5bcdd] [39mPlots v1.40.13
  [90m[10745b16] [39mStatistics v1.11.1
  [90m[37e2e46d] [39mLinearAlgebra v1.11.0
  [90m[9a3f8284] [39mRandom v1.11.0
  [90m[cf7118a7] [39mUUIDs v1.11.0


In [1]:
# Now try importing
using BenchmarkTools
using LinearAlgebra
using Distributions
using Random
using MLDatasets
using Plots
using Statistics
using DataFrames
using JLD2
using UUIDs
using Printf
using MLDataUtils

## Comparison of different optimizations to Variable

In [1]:
import Pkg
Pkg.add("BenchmarkTools")
using BenchmarkTools

abstract type GraphNode end
abstract type Operator <: GraphNode end

# Original implementation
mutable struct VariableOriginal <: GraphNode
    output :: Any
    grad :: Any
    name :: String
    VariableOriginal(output; name="?") = new(output, nothing, name)
end

# Optimized implementation
mutable struct VariableOptimized{T<:Float64} <: GraphNode
    output :: T
    grad :: Union{Nothing, T}
    name :: String
    VariableOptimized(output::T; name="?") where T<:Float64 = new{T}(output, nothing, name)
end

# RefValue-based immutable implementation
struct VariableRef <: GraphNode
    output :: Base.RefValue{Float64}
    grad   :: Union{Nothing, Base.RefValue{Float64}}
    name   :: String
    function VariableRef(output::Float64; name="?")
        new(Base.RefValue(output), nothing, name)
    end
end

# Functions to benchmark: reading
function read_output(v::VariableOriginal)
    v.output + 1.0
end

function read_output(v::VariableOptimized)
    v.output + 1.0
end

function read_output(v::VariableRef)
    v.output[] + 1.0
end

# Functions to benchmark: writing
function write_output!(v::VariableOriginal)
    v.output = v.output + 1.0
end

function write_output!(v::VariableOptimized)
    v.output = v.output + 1.0
end

function write_output!(v::VariableRef)
    v.output[] = v.output[] + 1.0
end

# Create instances
v1 = VariableOriginal(1.0)
v2 = VariableOptimized(1.0)
v3 = VariableRef(1.0)

# Benchmark READ
println("### Benchmark: READ access ###")
println("Original:")
@btime read_output($v1)

println("Optimized:")
@btime read_output($v2)

println("RefValue:")
@btime read_output($v3)

# Benchmark WRITE
println("\n### Benchmark: WRITE mutation ###")
println("Original:")
@btime write_output!($v1)

println("Optimized:")
@btime write_output!($v2)

println("RefValue:")
@btime write_output!($v3)



[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`


### Benchmark: READ access ###
Original:
  14.034 ns (1 allocation: 16 bytes)
Optimized:
  1.575 ns (0 allocations: 0 bytes)
RefValue:
  1.793 ns (0 allocations: 0 bytes)

### Benchmark: WRITE mutation ###
Original:
  14.417 ns (1 allocation: 16 bytes)
Optimized:
  2.013 ns (0 allocations: 0 bytes)
RefValue:
  2.013 ns (0 allocations: 0 bytes)


500503.0

## Next

In [2]:
import Base: *, +, clamp, log, exp
import LinearAlgebra: mul!
import Statistics: sum

abstract type GraphNode end
abstract type Operator <: GraphNode end

# Definition of basic structures for computational graph
mutable struct Constant{T<:Matrix{Float32}} <: GraphNode
    output :: T
end

mutable struct Variable{T<:Matrix{Float32}} <: GraphNode
    output :: T
    gradient :: T
    name :: String
    
    Variable(output::T; name="?") where {T<:Matrix{Float32}} = new{T}(output, zeros(Float32, size(output)), name)
end

mutable struct ScalarOperator{F} <: Operator
    inputs :: Tuple{GraphNode, GraphNode}
    output :: Float32
    gradient :: Float32
    name :: String
    ScalarOperator(fun, inputs...; name="?") = new{typeof(fun)}(inputs, 0.0f0, 0.0f0, name)
end

mutable struct BroadcastedOperator{F} <: Operator
    inputs :: Union{Tuple{GraphNode, GraphNode}, Tuple{GraphNode}}
    output :: Matrix{Float32}
    gradient :: Matrix{Float32}
    name :: String
    BroadcastedOperator(fun, inputs...; name="?") = new{typeof(fun)}(inputs, zeros(Float32, 1, 1), zeros(Float32, 1, 1), name)
end


import Base: show, summary
show(io::IO, x::ScalarOperator{F}) where {F} = print(io, "op ", x.name, "(", F, ")");
show(io::IO, x::BroadcastedOperator{F}) where {F} = print(io, "op.", x.name, "(", F, ")");
show(io::IO, x::Constant) = print(io, "const ", x.output)
show(io::IO, x::Variable) = begin
    print(io, "var ", x.name);
    print(io, "\n ┣━ ^ "); summary(io, x.output)
    print(io, "\n ┗━ ∇ ");  summary(io, x.gradient)
end


function visit(node::GraphNode, visited, order)
    if node ∈ visited
    else
        push!(visited, node)
        push!(order, node)
    end
    return zeros(Float32, 1, 1)
end

function visit(node::Operator, visited, order)
    if node ∈ visited
    else
        push!(visited, node)
        for input in node.inputs
            visit(input, visited, order)
        end
        push!(order, node)
    end
    return zeros(Float32, 1, 1)
end

function topological_sort(head::GraphNode)
    visited = Set()
    order = Vector()
    visit(head, visited, order)
    return order
end


# x * y (aka matrix multiplication)
*(A::GraphNode, x::GraphNode; name="mul") = BroadcastedOperator(mul!, A, x, name=name)
forward(::BroadcastedOperator{typeof(mul!)}, A, x) = return A * x
backward(::BroadcastedOperator{typeof(mul!)}, A, x, g) = tuple(g * x', A' * g)

# relu activation
relu(x::GraphNode; name="relu") = BroadcastedOperator(relu, x, name=name)
forward(::BroadcastedOperator{typeof(relu)}, x) = return x .* (x .> 0.0f0)
backward(::BroadcastedOperator{typeof(relu)}, x, g) = tuple(g .* (x .> 0.0f0), zeros(Float32, 1, 1))

# add operation (for bias)
+(x::GraphNode, y::GraphNode; name="sum") = BroadcastedOperator(+, x, y, name=name)
forward(::BroadcastedOperator{typeof(+)}, x, y) = return x .+ y
backward(::BroadcastedOperator{typeof(+)}, x, y, g) = begin
    grad_wrt_x = g
    grad_wrt_y = sum(g, dims=2)
    return (grad_wrt_x, grad_wrt_y)
end

# sigmoid activation
σ(x::GraphNode; name="sigmoid") = BroadcastedOperator(σ, x, name=name)
forward(::BroadcastedOperator{typeof(σ)}, x) = return 1.0f0 ./ (1.0f0 .+ exp.(-x))
backward(node::BroadcastedOperator{typeof(σ)}, x, g) = begin
    y = node.output
    local_derivative = y .* (1.0f0 .- y)
    grad_wrt_x = g .* local_derivative
    return (grad_wrt_x, zeros(Float32, 1, 1))
end

function binary_cross_entropy_loss_impl(ŷ, y_true; epsilon=1e-10)
    ŷ_clamped = clamp.(ŷ, epsilon, 1.0f0 - epsilon)
    loss_elements = -y_true .* log.(ŷ_clamped) .- (1.0f0 .- y_true) .* log.(1.0f0 .- ŷ_clamped)
    return mean(loss_elements)
end

binarycrossentropy(ŷ::GraphNode, y::GraphNode; name="bce_loss") = ScalarOperator(binary_cross_entropy_loss_impl, ŷ, y, name=name)

forward(::ScalarOperator{typeof(binary_cross_entropy_loss_impl)}, ŷ_value, y_value) = begin
    loss_value = binary_cross_entropy_loss_impl(ŷ_value, y_value)
    return loss_value
end

backward(::ScalarOperator{typeof(binary_cross_entropy_loss_impl)}, ŷ_value, y_value, g) = begin
    epsilon = 1e-10
    ŷ_clamped_for_grad = clamp.(ŷ_value, epsilon, 1.0f0 - epsilon)
    local_grad_per_sample = (ŷ_clamped_for_grad .- y_value) ./ (ŷ_clamped_for_grad .* (1.0f0 .- ŷ_clamped_for_grad))
    batch_size = size(y_value, 2)
    grad_wrt_ŷ = local_grad_per_sample ./ batch_size
    return (grad_wrt_ŷ, zeros(Float32, 1, 1))
end


backward (generic function with 5 methods)

In [3]:
reset!(node::Constant) = nothing
reset!(node::Variable) = node.gradient = zeros(Float32, size(node.output))

function reset!(node::Operator)
    if isa(node.output, Matrix{Float32})
        node.gradient = zeros(Float32, size(node.output))
    else
        node.gradient = 0.0f0
    end
end
#reset!(node::Operator) = node.gradient = zeros(Float32, size(node.output))

compute!(node::Constant) = nothing
compute!(node::Variable) = nothing

function compute!(node::Operator)
    node.output = forward(node, [input.output for input in node.inputs]...)
    if isa(node.output, Matrix{Float32})
        node.gradient = zeros(Float32, size(node.output))
    end
end
# compute!(node::Operator) =
#     node.output = forward(node, [input.output for input in node.inputs]...)

function forward!(order::Vector)
    #   Iteruje przez każdy węzeł w order.
    for node in order
        compute!(node)
        reset!(node)
    end
    return last(order).output
end

forward! (generic function with 1 method)

In [4]:
update!(node::Constant, gradient) = nothing

update!(node::GraphNode, gradient) = if isnothing(node.gradient)
    node.gradient = gradient else node.gradient .+= gradient
end

function backward!(order::Vector; seed=1.0)
    result = last(order)   #   The output node
    if all(iszero, result.gradient)
        if isa(result.output, Matrix{Float32})
            result.gradient = ones(Float32, size(result.output))
        else
            result.gradient = seed
            @assert length(result.output) == 1 "Gradient is defined only for scalar functions"
        end
    end

    for node in reverse(order)   #   Iterate through nodes in reverse topological order.
        backward!(node)   #   Compute and propagate gradients backwards.
    end
    return zeros(Float32, 1, 1)
end

function backward!(node::Constant) end
function backward!(node::Variable) end

function backward!(node::Operator)
    inputs = node.inputs

    gradients = backward(node, [input.output for input in inputs]..., node.gradient)

    for (input, gradient) in zip(inputs, gradients)
        update!(input, gradient)
    end
    return zeros(Float32, 1, 1)
end

backward! (generic function with 4 methods)

## Funkcja Xavier

In [5]:
function xavier_uniform(size::Tuple{Int, Int})
    limit = sqrt(6.0f0 / (size[1] + size[2]))
    return Float32.(rand(Uniform(-limit, limit), size))
end

function xavier_normal(size::Tuple{Int, Int})
    limit = sqrt(2.0f0 / (size[1] + size[2]))
    return Float32.(rand(Normal(0.0f0, limit), size))
end

function xavier_uniform!(w::Matrix{Float32})
    fan_out, fan_in = size(w)
    limit = sqrt(6.0f0 / (fan_in + fan_out))
    Float32.(rand!(Uniform(-limit, limit), w))
end

function xavier_normal!(w::Matrix{Float32})
    fan_out, fan_in = size(w)
    limit = sqrt(2.0f0 / (fan_in + fan_out))
    Float32.(rand!(Normal(0.0f0, limit), w))
end

xavier_normal! (generic function with 1 method)

##  Funkcje dostępowe do wag, biasów oraz ich gradientów

In [6]:
function get_weights(order::Vector)
    weights = Vector{Tuple{String, Variable}}()
    for node in order
        if isa(node, Variable)
            if occursin("w", node.name)
                push!(weights, (node.name, node))
            end
        end
    end
    return weights
end

function get_biases(order::Vector)
    biases = Vector{Tuple{String, Variable}}()
    for node in order
        if isa(node, Variable)
            if occursin("b", node.name)
                push!(biases, (node.name, node))
            end
        end
    end
    return biases
end

function get_weights_and_biases(order::Vector)
    parameters = Vector{Tuple{String, Variable}}()
    for node in order
        if isa(node, Variable)
            if occursin("w", node.name) || occursin("b", node.name)
                push!(parameters, (node.name, node))
            end
        end
    end
    return parameters
end

function get_gradients(order::Vector)
    gradients = Vector{Tuple{String, Variable}}()
    for node in order
        if isa(node, Variable)
            if occursin("w", node.name) || occursin("b", node.name)
                push!(gradients, (node.name, node))
            end
        end
    end
    return gradients
end

get_gradients (generic function with 1 method)

## Optymalizator ADAM

In [8]:
abstract type AbstractOptimizer end

struct Adam <: AbstractOptimizer
    α :: Float32    # learning rate
    β1 :: Float32   # First moment decay rate
    β2 :: Float32   # Second moment decay rate
    ε :: Float32    # Epsilon for numerical stability
end

Adam() = Adam(0.001f0, 0.9f0, 0.999f0, 1e-8)

mutable struct AdamState
    hyperparams :: Adam # Przechowuje konfigurację optymalizatora
    m :: Dict{String, Matrix{Float32}}
    v :: Dict{String, Matrix{Float32}}
    t :: Int
    parameters :: Vector{Tuple{String, Variable}}
end

function setup_optimizer(optimizer_config::AbstractOptimizer, model::Chain)
    trainable_vars = collect_model_parameters(model)
    m = Dict{String, Matrix{Float32}}()
    v = Dict{String, Matrix{Float32}}()
    for (name, var) in trainable_vars
        m[name] = zeros(Float32, size(var.output))
        v[name] = zeros(Float32, size(var.output))
    end
    return AdamState(optimizer_config, m, v, 0, trainable_vars)
end

function collect_model_parameters(model::Chain)
    all_params = Vector{Tuple{String, Variable}}()
    for layer in model.layers
        append!(all_params, collect_model_parameters(layer))
    end
    return all_params
end

function collect_model_parameters(layer::Dense)
    return [(layer.W.name, layer.W), (layer.b.name, layer.b)]
end

function step!(optimizer_state::AdamState)
    optimizer_state.t += 1

    config = optimizer_state.hyperparams # Dostęp do hyperparametrów z konfiguracji

    for (name, var) in optimizer_state.parameters
        g = var.gradient

        optimizer_state.m[name] = config.β1 * optimizer_state.m[name] + (1 - config.β1) * g
        optimizer_state.v[name] = config.β2 * optimizer_state.v[name] + (1 - config.β2) * (g .^ 2)

        m_corrected = optimizer_state.m[name] / (1 - config.β1 ^ optimizer_state.t)
        v_corrected = optimizer_state.v[name] / (1 - config.β2 ^ optimizer_state.t)

        var.output .-= config.α .* m_corrected ./ (sqrt.(v_corrected) .+ config.ε)
    end
end

function reset!(optimizer_state::AdamState)
    optimizer_state.t = 0
    #  Reset momentów
    for (name, var) in optimizer_state.parameters
        optimizer_state.m[name] .= zeros(size(var.output))
        optimizer_state.v[name] .= zeros(size(var.output))
    end
end


reset! (generic function with 4 methods)

## Higher level API

In [7]:
abstract type Layer end

mutable struct Dense <: Layer
    W::Variable
    b::Variable
    activation
    name::String
end

function Dense(in_features::Int, out_features::Int, activation=identity; 
    weight_init = xavier_uniform,
    bias_init = (dims) -> zeros(Float32, dims),
    name="dense")

    W = Variable(weight_init((out_features, in_features)); name="$(name)_w")

    b = Variable(bias_init((out_features, 1)); name="$(name)_b")

    return Dense(W, b, activation, name)
end

function (d::Dense)(x::GraphNode)

    multiplication_code = *(d.W, x, name="$(d.name)_mul")
    #   Dodanie biasu
    linear_output = +(multiplication_code, d.b, name="$(d.name)_add")
    # Przekazanie nazwy operatorowi aktywacji
    if d.activation == relu
        return relu(linear_output, name="$(d.name)_relu")
    elseif d.activation == σ
        return σ(linear_output, name="$(d.name)_sigmoid")
    else
        #   Użyj domyślnej nazwy
        try
             return d.activation(linear_output, name="$(d.name)_$(string(nameof(d.activation)))")
        catch
             return d.activation(linear_output)
        end
    end
end

mutable struct Chain
    layers::Vector{<:Layer}
end

Chain(layers...) = Chain([layers...])

function (c::Chain)(x::GraphNode)
    input = x
    for layer in c.layers
        input = layer(input)
    end
    return input
end

function build_graph!(model::Chain, loss_fn, input_node::GraphNode, label_node::GraphNode; loss_name="loss")

    model_output_node = model(input_node)
    loss_node = loss_fn(model_output_node, label_node; name=loss_name)

    if hasproperty(loss_node, :name)
        loss_node.name = loss_name
    end

    order = topological_sort(loss_node)

    return (loss_node, model_output_node, order)

end


build_graph! (generic function with 1 method)

## Test wejścia do neuronu

In [7]:
x = Variable(Float32.(reshape([1.0, 2.0, 3.0, 1.0, 2.0, 3.0], 3, 2)), name="x")
x.output

3×2 Matrix{Float32}:
 1.0  1.0
 2.0  2.0
 3.0  3.0

In [8]:
w = Variable(Float32.([1.0 2.0 3.0]), name="w")
w.output

1×3 Matrix{Float32}:
 1.0  2.0  3.0

In [9]:
z = w * x
z.name = "z"

"z"

In [10]:
order = topological_sort(z)
println("Topological order:")
order
weights = get_weights(order)


Topological order:


1-element Vector{Tuple{String, Variable}}:
 ("w", var w
 ┣━ ^ 1×3 Matrix{Float32}
 ┗━ ∇ 1×3 Matrix{Float32})

In [11]:
y = forward!(order)
z.output

1×2 Matrix{Float32}:
 14.0  14.0

In [12]:
backward!(order)

1×1 Matrix{Float32}:
 0.0

## Test 2 szeregowych Neuronów - 1 warstwa + bias


In [22]:
x = Variable(Float32.([1.0 1.0; 2.0 2.0; 3.0 3.0]), name="x")
w = Variable(Float32.([2.0 4.0 6.0; 3.0 5.0 7.0]), name="w")
y = Constant(Float32.(reshape([1.0, 1.0], 1, 2)))
z = w * x
z.name = "z"
# c = Constant(1.0)
# d = z + c
# dense_layer_2 = σ(z)
# dense_layer_2.name = "σ(z)"
dense_layer_2 = relu(z)
dense_layer_2.name = "relu(z)"
loss = binarycrossentropy(dense_layer_2, y)
loss.name = "binarycrossentropy"
order = topological_sort(loss)
y = forward!(order)
backward!(order)

1×1 Matrix{Float32}:
 0.0

## Test 2. warstw neuronów 2-4

In [8]:
#   Pierwsza warstwa
x = Variable(Float32.(reshape([1.0, 2.0, 3.0], 3, 1)), name="x")
w = Variable(Float32.(reshape([2.0, 3.0, 4.0, 5.0, 6.0, 7.0], 2, 3)), name="w1")
a = w * x
a.name = "a"
b = relu(a)
b.name = "b"

#   Druga warstwa
w2 = Variable(Float32.(reshape([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], 4,2)), name="w2")
c = w2 * b
c.name = "c"
d = relu(c)
d.name = "d"
order = topological_sort(d)

7-element Vector{Any}:
 var w2
 ┣━ ^ 4×2 Matrix{Float32}
 ┗━ ∇ 4×2 Matrix{Float32}
 var w1
 ┣━ ^ 2×3 Matrix{Float32}
 ┗━ ∇ 2×3 Matrix{Float32}
 var x
 ┣━ ^ 3×1 Matrix{Float32}
 ┗━ ∇ 3×1 Matrix{Float32}
 op.a(typeof(mul!))
 op.b(typeof(relu))
 op.c(typeof(mul!))
 op.d(typeof(relu))

In [9]:
ŷ = forward!(order)
backward!(order)

1×1 Matrix{Float32}:
 0.0

## Test binary cross entropy

In [10]:
ŷ = Variable(Float32.(reshape([0.8], 1, 1)), name="ŷ")
y = Variable(Float32.(reshape([1.0], 1, 1)), name="y")
loss = binarycrossentropy(ŷ, y)
loss.name = "loss"
order = topological_sort(loss)
result = forward!(order)


0.22314353f0

In [11]:
backward!(order)

1×1 Matrix{Float32}:
 0.0

##  Test tworzenia modelu dla batch = 2 relu-sigmoid-bce

In [27]:
x = Constant(Float32.([1.0 1.0; 2.0 1.0; 3.0 1.0]))
w1 = Variable(Float32.([0.1 0.2 0.3; 0.4 0.5 0.6]), name="w1")
z1_mul = w1 * x
z1_mul.name = "z1_mul"


b1_matrix = zeros(Float32, 2, 1)
b1_matrix[1,1] = 0.1f0
b1_matrix[2,1] = 0.2f0
b1 = Variable(b1_matrix, name="b1")
z1 = z1_mul + b1
z1.name = "z1"

a1 = relu(z1)
a1.name = "a1"

w2_matrix = zeros(Float32, 1, 2)
w2_matrix[1,1] = 0.5f0
w2_matrix[1,2] = -0.5f0
w2 = Variable(w2_matrix, name="w2")
z2_mul = w2 * a1
z2_mul.name = "z2_mul"

b2_matrix = zeros(Float32, 1, 1)
b2_matrix[1,1] = 0.0f0
b2 = Variable(b2_matrix, name="b2")
z2 = z2_mul + b2

ŷ = σ(z2)
ŷ.name = "ŷ"

y_matrix = zeros(Float32, 1, 2)
y_matrix[1,1] = 1.0f0
y_matrix[1,2] = 0.0f0
y = Constant(y_matrix)

loss = binarycrossentropy(ŷ, y)
loss.name = "loss"

"loss"

In [28]:
order = topological_sort(loss)

13-element Vector{Any}:
 var w2
 ┣━ ^ 1×2 Matrix{Float32}
 ┗━ ∇ Nothing
 var w1
 ┣━ ^ 2×3 Matrix{Float32}
 ┗━ ∇ Nothing
 const Float32[1.0 1.0; 2.0 1.0; 3.0 1.0]
 op.z1_mul(typeof(mul!))
 var b1
 ┣━ ^ 2×1 Matrix{Float32}
 ┗━ ∇ Nothing
 op.z1(typeof(+))
 op.a1(typeof(relu))
 op.z2_mul(typeof(mul!))
 var b2
 ┣━ ^ 1×1 Matrix{Float32}
 ┗━ ∇ Nothing
 op.?(typeof(+))
 op.ŷ(typeof(σ))
 const Float32[1.0 0.0]
 op loss(typeof(binary_cross_entropy_loss_impl))

In [29]:
result = forward!(order)

0.8755167f0

In [30]:
w2.output

1×2 Matrix{Float32}:
 0.5  -0.5

In [31]:
backward!(order)

In [32]:
get_weights(order)
get_biases(order)
get_gradients(order)

4-element Vector{Tuple{String, Variable}}:
 ("w2", var w2
 ┣━ ^ 1×2 Matrix{Float32}
 ┗━ ∇ 1×2 Matrix{Float32})
 ("w1", var w1
 ┣━ ^ 2×3 Matrix{Float32}
 ┗━ ∇ 2×3 Matrix{Float32})
 ("b1", var b1
 ┣━ ^ 2×1 Matrix{Float32}
 ┗━ ∇ 2×1 Matrix{Float32})
 ("b2", var b2
 ┣━ ^ 1×1 Matrix{Float32}
 ┗━ ∇ 1×1 Matrix{Float32})

## Iris Test

In [12]:
using MLDatasets
using Random
using LinearAlgebra
using Plots
using DataFrames # Dodajmy pakiet DataFrame, bo wygląda na to, że jest używany
using MLDataUtils

# Załaduj zbiór danych Iris
iris_features, iris_targets = Iris(as_df=false)[:]
class1_name = "Iris-setosa"
class2_name = "Iris-versicolor"
iris_features_cut = iris_features[:, 1:100];
iris_targets_cut = iris_targets[:, 1:100];

label_mapping = Dict("Iris-setosa" => 0.0, "Iris-versicolor" => 1.0)
iris_targets_cut_classes = [label_mapping[class_name] for class_name in iris_targets_cut]
iris_shuffled_all_x, iris_shuffled_all_y = shuffleobs((iris_features_cut, iris_targets_cut_classes));
# Podział na zbiór treningowy i testowy (np. 80% train, 20% test)
train_ratio = 0.8
num_all_obs = size(iris_shuffled_all_x, 2)
num_train_obs = floor(Int, num_all_obs * train_ratio)

X_train = Float32.(iris_shuffled_all_x[:, 1:num_train_obs])
y_train = Float32.(iris_shuffled_all_y[:, 1:num_train_obs]) # Zakładając, że y_train ma kształt (out, num_obs)

X_test = Float32.(iris_shuffled_all_x[:, num_train_obs+1:end])
y_test = Float32.(iris_shuffled_all_y[:, num_train_obs+1:end]) # Podobnie dla y_test

1×20 Matrix{Float32}:
 0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0  …  0.0  1.0  1.0  1.0  1.0  0.0  1.0

In [13]:
#   Ustawienia sieci neuronowej
features = 4
hidden = 8
out = 1
epochs = 30
batch_size = 10

x = Constant(zeros(Float32, features, batch_size))
w1 = Variable(xavier_uniform((hidden, features)); name="w1")
z1_mul = w1 * x
z1_mul.name = "z1_mul"
b1 = Variable(xavier_uniform((hidden, 1)); name="b1")
z1 = z1_mul + b1
z1.name = "z1"
d1 = relu(z1)
d1.name = "d1"
w2 = Variable(xavier_uniform((out, hidden)); name="w2")
z2_mul = w2 * d1
z2_mul.name = "z2_mul"
b2 = Variable(xavier_uniform((out, 1)); name="b2")
z2 = z2_mul + b2
z2.name = "z2"
ŷ = σ(z2)
ŷ.name = "ŷ"
y = Constant(zeros(Float32, out, batch_size))
loss = binarycrossentropy(ŷ, y)
loss.name = "loss"
order = topological_sort(loss)


13-element Vector{Any}:
 var w2
 ┣━ ^ 1×8 Matrix{Float32}
 ┗━ ∇ 1×8 Matrix{Float32}
 var w1
 ┣━ ^ 8×4 Matrix{Float32}
 ┗━ ∇ 8×4 Matrix{Float32}
 const Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 op.z1_mul(typeof(mul!))
 var b1
 ┣━ ^ 8×1 Matrix{Float32}
 ┗━ ∇ 8×1 Matrix{Float32}
 op.z1(typeof(+))
 op.d1(typeof(relu))
 op.z2_mul(typeof(mul!))
 var b2
 ┣━ ^ 1×1 Matrix{Float32}
 ┗━ ∇ 1×1 Matrix{Float32}
 op.z2(typeof(+))
 op.ŷ(typeof(σ))
 const Float32[0.0 0.0 … 0.0 0.0]
 op loss(typeof(binary_cross_entropy_loss_impl))

In [14]:
#   Start ADAM   
optimizer = init!(order)
println("Initial state of optimizer:")
println(get_state(optimizer))

Initial state of optimizer:
Dict{String, Any}("v" => Dict{String, Matrix{Float32}}("b2" => [0.0;;], "w2" => [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0], "w1" => [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0], "b1" => [0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]), "m" => Dict{String, Matrix{Float32}}("b2" => [0.0;;], "w2" => [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0], "w1" => [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0], "b1" => [0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;;]), "t" => 0)


In [17]:
num_training_samples = size(X_train, 2) # Liczba próbek w zbiorze treningowym


#   Run training and visualize results
loss_value = 0.0
for epoch in 1:epochs
    # --- Tasowanie zbioru treningowego NA NOWO w każdej epoce ---
    permutation = randperm(num_training_samples)
    X_train_shuffled_epoch = X_train[:, permutation]
    y_train_shuffled_epoch = y_train[:, permutation]
    num_batches = ceil(Int, num_training_samples / batch_size)

    println("Epoch $epoch")
    for i in 1:num_batches
        # Wybierz batch z POTASOWANYCH W TEJ EPOCE danych
        start_idx = (i - 1) * batch_size + 1
        end_idx = min(i * batch_size, num_training_samples)
        x_batch = X_train_shuffled_epoch[:, start_idx:end_idx]
        y_batch = y_train_shuffled_epoch[:, start_idx:end_idx]

        x.output = x_batch
        y.output = y_batch

        forward!(order)

        #println("ŷ: ", ŷ.output)
        #println("y: ", y.output)
    
        loss_value = loss.output
        backward!(order)
        step!(optimizer)

    end

    println("Loss after epoch $epoch: ", loss_value)
end

Epoch 1
Loss after epoch 1: 0.39245194
Epoch 2
Loss after epoch 2: 0.36798656
Epoch 3
Loss after epoch 3: 0.393301
Epoch 4
Loss after epoch 4: 0.365246
Epoch 5
Loss after epoch 5: 0.36020437
Epoch 6
Loss after epoch 6: 0.3220604
Epoch 7
Loss after epoch 7: 0.35850725
Epoch 8
Loss after epoch 8: 0.3821807
Epoch 9
Loss after epoch 9: 0.32759959
Epoch 10
Loss after epoch 10: 0.31206957
Epoch 11
Loss after epoch 11: 0.3031282
Epoch 12
Loss after epoch 12: 0.31560922
Epoch 13
Loss after epoch 13: 0.26405737
Epoch 14
Loss after epoch 14: 0.3200249
Epoch 15
Loss after epoch 15: 0.28471595
Epoch 16
Loss after epoch 16: 0.2944382
Epoch 17
Loss after epoch 17: 0.24712132
Epoch 18
Loss after epoch 18: 0.25892696
Epoch 19
Loss after epoch 19: 0.25536734
Epoch 20
Loss after epoch 20: 0.2164693
Epoch 21
Loss after epoch 21: 0.24449864
Epoch 22
Loss after epoch 22: 0.21425393
Epoch 23
Loss after epoch 23: 0.21903259
Epoch 24
Loss after epoch 24: 0.20544605
Epoch 25
Loss after epoch 25: 0.21182384
Epo

In [18]:
println("\n--- Starting Test Evaluation ---")

x.output = X_test
y.output = y_test

forward!(order)

predictions_prob = ŷ.output

predicted_classes = (predictions_prob .> 0.5) # Wynik to BitMatrix (1, liczba_próbek_testowych)

true_classes = convert.(Bool, y_test) # Wynik to BitMatrix (1, liczba_próbek_testowych)

# --- Obliczanie Dokładności ---
# Najprostsze obliczenie powinno działać poprawnie na BitMatrix
correct_predictions = sum(predicted_classes .== true_classes) # Sumuje true w macierzy wynikowej porównania element-wise

total_test_samples = size(X_test, 2)
accuracy = correct_predictions / total_test_samples

println("Test Accuracy: $(accuracy * 100.0) %")

# --- Obliczanie Macierzy Pomyłek (Confusion Matrix) - POPRAWIONE ---
# Używamy standardowych operatorów logicznych na macierzach Boolowskich
# TP: predicted = true AND true = true
TP = sum(predicted_classes .& true_classes)

# TN: predicted = false AND true = false
TN = sum(.!predicted_classes .& .!true_classes)

# FP: predicted = true AND true = false
FP = sum(predicted_classes .& .!true_classes)

# FN: predicted = false AND true = true
FN = sum(.!predicted_classes .& true_classes)

println("\nConfusion Matrix:")
println("TP: $(TP), TN: $(TN), FP: $(FP), FN: $(FN)")

# Ważna weryfikacja: Sprawdź, czy suma komponentów CM równa się liczbie próbek testowych
println("Sum of CM components: $(TP + TN + FP + FN) (Should be $(total_test_samples))")

# Możesz teraz bezpiecznie obliczyć Precision, Recall, F1, używając tych (poprawionych) wartości TP, TN, FP, FN.
if (TP + FP) > 0
    precision = TP / (TP + FP)
    println("Precision (for class 1): $(precision)")
else
    println("Precision (for class 1): N/A (No positive predictions)")
end

if (TP + FN) > 0
    recall = TP / (TP + FN)
     println("Recall (for class 1): $(recall)")
else
     println("Recall (for class 1): N/A (No actual positive samples of class 1)")
end

if (precision + recall) > 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    println("F1 Score (for class 1): $(f1_score)")
else
    println("F1 Score (for class 1): N/A")
end


println("\n--- Test Evaluation Finished ---")


--- Starting Test Evaluation ---
Test Accuracy: 100.0 %

Confusion Matrix:
TP: 12, TN: 8, FP: 0, FN: 0
Sum of CM components: 20 (Should be 20)
Precision (for class 1): 1.0
Recall (for class 1): 1.0
F1 Score (for class 1): 1.0

--- Test Evaluation Finished ---


## IMDB Final Test

In [11]:
X_train = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "X_train"));
y_train = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "y_train"));
X_test = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "X_test"));
y_test = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "y_test"));


In [14]:
#   Ustawienia sieci neuronowej
features = size(X_train, 1)
hidden = 32
out = 1
epochs = 5
batch_size = 64

x = Constant(zeros(Float32, features, batch_size))
w1 = Variable(xavier_uniform((hidden, features)); name="w1")
z1_mul = w1 * x
z1_mul.name = "z1_mul"
b1 = Variable(xavier_uniform((hidden, 1)); name="b1")
z1 = z1_mul + b1
z1.name = "z1"
d1 = relu(z1)
d1.name = "d1"
w2 = Variable(xavier_uniform((out, hidden)); name="w2")
z2_mul = w2 * d1
z2_mul.name = "z2_mul"
b2 = Variable(xavier_uniform((out, 1)); name="b2")
z2 = z2_mul + b2
z2.name = "z2"
ŷ = σ(z2)
ŷ.name = "ŷ"
y = Constant(zeros(Float32, out, batch_size))
loss = binarycrossentropy(ŷ, y)
loss.name = "loss"
order = topological_sort(loss)

UndefVarError: UndefVarError: `X_train` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [10]:
#   Start ADAM   
optimizer = init!(order)

Adam(0.001f0, 0.9f0, 0.999f0, 1.0f-8, Dict{String, Matrix{Float32}}("b2" => [0.0;;], "w2" => [0.0 0.0 … 0.0 0.0], "w1" => [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], "b1" => [0.0; 0.0; … ; 0.0; 0.0;;]), Dict{String, Matrix{Float32}}("b2" => [0.0;;], "w2" => [0.0 0.0 … 0.0 0.0], "w1" => [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], "b1" => [0.0; 0.0; … ; 0.0; 0.0;;]), 0, Tuple{String, Variable}[("w2", var w2
 ┣━ ^ 1×32 Matrix{Float32}
 ┗━ ∇ 1×32 Matrix{Float32}), ("w1", var w1
 ┣━ ^ 32×17703 Matrix{Float32}
 ┗━ ∇ 32×17703 Matrix{Float32}), ("b1", var b1
 ┣━ ^ 32×1 Matrix{Float32}
 ┗━ ∇ 32×1 Matrix{Float32}), ("b2", var b2
 ┣━ ^ 1×1 Matrix{Float32}
 ┗━ ∇ 1×1 Matrix{Float32})])

In [11]:
using Printf
num_training_samples = size(X_train, 2) # Liczba próbek w zbiorze treningowym


#   Run training and visualize results
loss_value = 0.0
for epoch in 1:epochs
    # --- Tasowanie zbioru treningowego NA NOWO w każdej epoce ---
    permutation = randperm(num_training_samples)
    X_train_shuffled_epoch = X_train[:, permutation]
    y_train_shuffled_epoch = y_train[:, permutation]
    num_batches = ceil(Int, num_training_samples / batch_size)

    total_loss = 0.0
    total_acc = 0.0
    num_samples = 0

    t = @elapsed begin
        println("Epoch $epoch")
        for i in 1:num_batches
            # Wybierz batch z POTASOWANYCH W TEJ EPOCE danych
            start_idx = (i - 1) * batch_size + 1
            end_idx = min(i * batch_size, num_training_samples)
            x_batch = X_train_shuffled_epoch[:, start_idx:end_idx]
            y_batch = y_train_shuffled_epoch[:, start_idx:end_idx]
    
            x.output = x_batch
            y.output = y_batch
    
            forward!(order)
            backward!(order)
            step!(optimizer)

            num_samples = i
        end
    end
    loss_value = loss.output

    println(@sprintf("Loss after epoch %d (%.2fs): %.2f", epoch, t, loss_value))
end

Epoch 1
Loss after epoch 1 (4.87s): 0.64
Epoch 2
Loss after epoch 2 (1.42s): 0.55
Epoch 3
Loss after epoch 3 (1.39s): 0.48
Epoch 4
Loss after epoch 4 (1.45s): 0.30
Epoch 5
Loss after epoch 5 (1.40s): 0.26


In [14]:
println("\n--- Starting Test Evaluation ---")

x.output = X_test
y.output = y_test

forward!(order)

predictions_prob = ŷ.output

predicted_classes = (predictions_prob .> 0.5) # Wynik to BitMatrix (1, liczba_próbek_testowych)

true_classes = convert.(Bool, y_test) # Wynik to BitMatrix (1, liczba_próbek_testowych)

# --- Obliczanie Dokładności ---
# Najprostsze obliczenie powinno działać poprawnie na BitMatrix
correct_predictions = sum(predicted_classes .== true_classes) # Sumuje true w macierzy wynikowej porównania element-wise

total_test_samples = size(X_test, 2)
accuracy = correct_predictions / total_test_samples

println("Test Accuracy: $(accuracy * 100.0) %")

# --- Obliczanie Macierzy Pomyłek (Confusion Matrix) - POPRAWIONE ---
# Używamy standardowych operatorów logicznych na macierzach Boolowskich
# TP: predicted = true AND true = true
TP = sum(predicted_classes .& true_classes)

# TN: predicted = false AND true = false
TN = sum(.!predicted_classes .& .!true_classes)

# FP: predicted = true AND true = false
FP = sum(predicted_classes .& .!true_classes)

# FN: predicted = false AND true = true
FN = sum(.!predicted_classes .& true_classes)

println("\nConfusion Matrix:")
println("TP: $(TP), TN: $(TN), FP: $(FP), FN: $(FN)")

# Ważna weryfikacja: Sprawdź, czy suma komponentów CM równa się liczbie próbek testowych
println("Sum of CM components: $(TP + TN + FP + FN) (Should be $(total_test_samples))")

# Możesz teraz bezpiecznie obliczyć Precision, Recall, F1, używając tych (poprawionych) wartości TP, TN, FP, FN.
if (TP + FP) > 0
    precision = TP / (TP + FP)
    println("Precision (for class 1): $(precision)")
else
    println("Precision (for class 1): N/A (No positive predictions)")
end

if (TP + FN) > 0
    recall = TP / (TP + FN)
     println("Recall (for class 1): $(recall)")
else
     println("Recall (for class 1): N/A (No actual positive samples of class 1)")
end

if (precision + recall) > 0
    f1_score = 2 * (precision * recall) / (precision + recall)
    println("F1 Score (for class 1): $(f1_score)")
else
    println("F1 Score (for class 1): N/A")
end


println("\n--- Test Evaluation Finished ---")


--- Starting Test Evaluation ---
Test Accuracy: 86.45 %

Confusion Matrix:
TP: 924, TN: 805, FP: 170, FN: 101
Sum of CM components: 2000 (Should be 2000)
Precision (for class 1): 0.8446069469835467
Recall (for class 1): 0.9014634146341464
F1 Score (for class 1): 0.8721094856064181

--- Test Evaluation Finished ---


##  API Test

In [9]:
X_train = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "X_train"));
y_train = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "y_train"));
X_test = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "X_test"));
y_test = Matrix(load("../../dataset/imdb_dataset_prepared.jld2", "y_test"));

In [12]:
# 1. Definicja rozmiarów modelu
input_size = size(X_train, 1) # Liczba cech
hidden_size = 8
output_size = 1
batch_size = 64

# 2. Inicjalizacja modelu (Chain) (raz)
model = Chain(
    Dense(input_size, hidden_size, relu; weight_init=xavier_uniform,  name="layer1"),
    Dense(hidden_size, output_size, σ; weight_init=xavier_uniform, name="layer2")
)

# 3. Utworzenie początkowych węzłów Constant dla danych wejściowych i etykiet (raz)
x_input_node = Constant(zeros(Float32, input_size, batch_size))
y_label_node = Constant(zeros(Float32, output_size, batch_size))

# 4. Budowanie grafu treningowego (raz)
loss_node, model_output_node, order = build_graph!(model, binarycrossentropy, x_input_node, y_label_node; loss_name="loss")

optimizer_state = setup_optimizer(Adam(), model)

# 5. Początek treningu
epochs = 5


for epoch in 1:epochs
    # --- Tasowanie zbioru treningowego NA NOWO w każdej epoce ---
    permutation = randperm(size(X_train, 2))
    X_train_shuffled_epoch = X_train[:, permutation]
    y_train_shuffled_epoch = y_train[:, permutation]
    num_batches = ceil(Int, size(X_train, 2) / batch_size)

    loss_value = 0.0

    t = @elapsed begin

    for i in 1:num_batches

        start_idx = (i - 1) * batch_size + 1
        end_idx = min(i * batch_size, size(X_train, 2))
        x_batch = X_train_shuffled_epoch[:, start_idx:end_idx]
        y_batch = y_train_shuffled_epoch[:, start_idx:end_idx]

        current_batch_size = size(x_batch, 2)
        view(x_input_node.output, :, 1:current_batch_size) .= x_batch
        view(y_label_node.output, :, 1:current_batch_size) .= y_batch


        forward!(order)

        backward!(order)

        step!(optimizer_state)
        loss_value += loss_node.output

    end
end
    avg_loss_epoch = loss_value / num_batches

    println(@sprintf("Epoch: %d (%.2fs) \tTrain: (l: %.2f)", epoch, t, avg_loss_epoch))
end



Epoch: 1 (1.22s) 	Train: (l: 0.67)
Epoch: 2 (1.22s) 	Train: (l: 0.58)
Epoch: 3 (1.22s) 	Train: (l: 0.47)
Epoch: 4 (1.22s) 	Train: (l: 0.38)
Epoch: 5 (1.22s) 	Train: (l: 0.30)


In [11]:
# --- Test Evaluation ---

batch_size = 64
num_test_samples = size(X_test, 2)
num_batches = ceil(Int, num_test_samples / batch_size)
total_test_loss_sum = 0.0
total_correct_predictions = 0.0

t_test = @elapsed begin
    for i in 1:num_batches

        start_idx = (i - 1) * batch_size + 1
        end_idx = min(i * batch_size, num_test_samples)
        x_batch_test = X_test[:, start_idx:end_idx]
        y_batch_test = y_test[:, start_idx:end_idx]

        # Aktualna liczba próbek w bieżącym batchu (może być mniejsza dla ostatniego batcha)
        current_test_batch_size = size(x_batch_test, 2)

        view(x_input_node.output, :, 1:current_test_batch_size) .= x_batch_test
        view(y_label_node.output, :, 1:current_test_batch_size) .= y_batch_test

        forward!(order)

        predictions = view(model_output_node.output, :, 1:current_test_batch_size)


        batch_loss = loss_node.output
        
        total_test_loss_sum += batch_loss * current_test_batch_size # Sumuj stratę, uwzględniając rozmiar batcha

        # --- Oblicz dokładność na bieżącym batchu testowym ---
        # Dla klasyfikacji binarnej z progiem 0.5 (lub innym, w zależności od problemu)
        batch_accuracy = sum((predictions .> 0.5f0) .== y_batch_test) / current_test_batch_size
        total_correct_predictions += batch_accuracy * current_test_batch_size # Sumuj poprawne predykcje
    end
end

# --- Oblicz średnią stratę i średnią dokładność na całym zbiorze testowym ---
avg_test_loss = total_test_loss_sum / num_test_samples
avg_test_accuracy = total_correct_predictions / num_test_samples * 100.0

println(@sprintf("Test Loss (czas: %.2fs): %.4f", t_test, avg_test_loss))
println("Test Accuracy: $avg_test_accuracy %")


Test Loss (czas: 0.39s): 0.3952
Test Accuracy: 86.45 %
