In [1]:
using MLDatasets, Flux, Statistics
# Load the MNIST training and test splits through MLDatasets.
train_data = MLDatasets.MNIST(; split=:train)
test_data = MLDatasets.MNIST(; split=:test)


"""
    loader(data)

Flatten the samples stored in `data.features` and one-hot encode `data.targets`.

The first two dimensions of `data.features` are merged into one, so the returned feature
matrix has `dim1 * dim2` rows and one column per entry along the third dimension.

# Returns
A tuple `(x, y, yhot)`: the reshaped features, `data.targets` unchanged, and
`Flux.onehotbatch(data.targets, 0:9)` (one column per target, classes `0:9`).
"""
function loader(data)
    features = data.features
    nrows, ncols, nsamples = size(features)
    flat = reshape(features, nrows * ncols, nsamples)
    labels = data.targets
    onehot = Flux.onehotbatch(labels, 0:9)
    return flat, labels, onehot
end

# Unpack the training split: flattened features, raw targets and one-hot targets.
x1, y1, yhot = loader(train_data);

In [5]:
"""
    Dual{T<:Number} <: Number

Dual number `v + dv·ϵ` used for forward-mode automatic differentiation: `v` holds the
value and `dv` the derivative part that is propagated by every overloaded operation.
"""
struct Dual{T<:Number} <: Number
    v::T
    dv::T
end

# Accept components of different numeric types (e.g. `Dual(1, 2.0)`) by promoting them to
# a common type first; the default constructor only matches two values of the same type.
Dual(v::Number, dv::Number) = Dual(promote(v, dv)...)

# Overloads of the basic arithmetic operators (sum, product and quotient rules).
import Base: +, -, *, /
-(x::Dual)          = Dual(-x.v,       -x.dv)
+(x::Dual, y::Dual) = Dual( x.v + y.v,  x.dv + y.dv)
-(x::Dual, y::Dual) = Dual( x.v - y.v,  x.dv - y.dv)
*(x::Dual, y::Dual) = Dual( x.v * y.v,  x.dv * y.v + x.v * y.dv)
/(x::Dual, y::Dual) = Dual( x.v / y.v, (x.dv * y.v - x.v * y.dv)/y.v^2)

# Overloads of elementary functions; each derivative part applies the chain rule.
import Base: abs, sin, cos, tan, exp, sqrt, isless, log
abs(x::Dual)  = Dual(abs(x.v), sign(x.v)*x.dv)
sin(x::Dual)  = Dual(sin(x.v), cos(x.v)*x.dv)
cos(x::Dual)  = Dual(cos(x.v), -sin(x.v)*x.dv)
tan(x::Dual)  = Dual(tan(x.v), one(x.v)*x.dv + tan(x.v)^2*x.dv)
exp(x::Dual)  = Dual(exp(x.v), exp(x.v)*x.dv)
# d/dx sqrt(x) = 1 / (2 sqrt(x)); the integer literal keeps the component type, so a
# `Dual{Float32}` is not mixed with a Float64 derivative part.
sqrt(x::Dual) = Dual(sqrt(x.v), x.dv / (2 * sqrt(x.v)))
log(x::Dual)  = Dual(log(x.v), x.dv/x.v)
# Ordering looks at the value part only.
isless(x::Dual, y::Dual) = x.v < y.v

# Type promotion and conversion
import Base: convert, promote_rule
convert(::Type{Dual{T}}, x::Dual) where T = Dual(convert(T, x.v), convert(T, x.dv))
# A plain number becomes a constant: its derivative part is zero.
convert(::Type{Dual{T}}, x::Number) where T = Dual(convert(T, x), zero(T))
# Two duals promote component-wise. Without this method the generic rule below would
# match `R = Dual{S}` and wrap the result in an extra `Dual` layer.
promote_rule(::Type{Dual{T}}, ::Type{Dual{R}}) where {T,R} = Dual{promote_type(T,R)}
promote_rule(::Type{Dual{T}}, ::Type{R}) where {T,R<:Number} = Dual{promote_type(T,R)}

# Helper functions
import Base: show
show(io::IO, x::Dual) = print(io, "(", x.v, ") + [", x.dv, "ϵ]")
# Accessors for the value part and the derivative part.
value(x::Dual) = x.v
partials(x::Dual) = x.dv
# Rectified linear unit: the larger of `x` and a zero of the same type as `x`.
ReLU(x) = max(zero(x), x)
# Logistic sigmoid 1 / (1 + exp(-x)); `one(x)` keeps the arithmetic in the type of `x`.
σ(x) = one(x) / (one(x) + exp(-x))
# Hyperbolic tangent written as 2 / (1 + exp(-2x)) - 1. Integer literals are used so that
# a Float32 (or dual-number) argument is not widened to Float64 by the constants.
# NOTE(review): no `import Base: tanh` is visible here, so this presumably defines a new
# `tanh` in the current module rather than adding a method to Base's — confirm.
tanh(x) = 2 / (one(x) + exp(-2x)) - one(x)
# Dual number with value part 0.0 and derivative part 1.0.
ϵ = Dual(0., 1.)
# `derivative(f, x)` evaluates `f` on `x` paired with a unit derivative part and returns
# the derivative part of the result. `D` is bound to the same function.
function derivative(f, x)
    seeded = Dual(x, one(x))
    return partials(f(seeded))
end
D = derivative
"""
    jacobian(f, args::AbstractVector{<:Number})

Forward-mode Jacobian of `f` at `args`; also available under the alias `J`.

`f` is evaluated once per input component. In evaluation `i` every component is wrapped
in a `Dual` whose derivative part is one for component `i` and zero otherwise, and the
derivative parts of the outputs form column `i` of the result.

# Returns
A matrix with one row per output of `f` and one column per element of `args` (a scalar
`f` yields a single row). Empty `args` yields an empty `0×0` matrix.
"""
function jacobian(f, args::AbstractVector{T}) where {T<:Number}
    # Without inputs there are no columns to concatenate.
    isempty(args) && return Matrix{T}(undef, 0, 0)
    columns = map(eachindex(args)) do i
        seeded = [Dual(a, j == i ? one(a) : zero(a)) for (j, a) in pairs(args)]
        # Splatting turns a scalar, tuple or array result into a flat vector. The element
        # type is taken from the derivatives themselves, so fractional derivatives of
        # integer inputs are not forced back into the input type.
        reshape(partials.([f(seeded)...]), :, 1)
    end
    # `reduce(hcat, ...)` avoids splatting one argument per input component.
    return reduce(hcat, columns)
end
J = jacobian

# Hessian of `f` at `args`, obtained by applying `J` to the map `x -> J(f, x)`.
# `H` is bound to the same function.
function hessian(f, args::Vector)
    gradient_at(point::Vector) = J(f, point)
    return J(gradient_at, args)
end
H = hessian


hessian (generic function with 1 method)

In [3]:
"""
    log_softmax(x)

Return the element-wise logarithm of the softmax of `x`, i.e. `x .- log(sum(exp.(x)))`.

The largest entry of `x` is subtracted from every element before exponentiating, so the
largest exponent is zero and `exp` cannot overflow. An empty `x` yields an empty result.
"""
function log_softmax(x)
    # Nothing to normalize.
    isempty(x) && return x .- x
    # `maximum` reduces over the whole collection. A one-argument broadcast of `max` does
    # not reduce anything and therefore cannot provide the stabilizing shift.
    shifted = x .- maximum(x)
    log_norm = log(sum(exp, shifted))
    return shifted .- log_norm
end


# Cross-entropy between the target weights `y_true` and the raw scores `y_pred`:
# the negated sum of `y_true` times `log_softmax(y_pred)`, taken element-wise.
function logit_cross_entropy(y_pred, y_true)
    log_probabilities = log_softmax(y_pred)
    weighted = y_true .* log_probabilities
    return -sum(weighted)
end

logit_cross_entropy (generic function with 1 method)

In [2]:
# Boolean target vector of length 10 with the single `true` entry at position 6.
a = Vector{Bool}([0, 0, 0, 0, 0, 1, 0, 0, 0, 0]);
# Reference value: Flux's logit cross-entropy for a fixed score vector against `a`.
Flux.logitcrossentropy([1.9514772580227577, 1.848623792879922, -6.456514365666464, -5.10149420660439, 3.5482988599977925, -2.210665982825723, 4.338133361390976, -8.256227686203022, -9.262014936475973, -1.3432975629321589], a)
# Disabled: the same scores and target passed to the hand-written `logit_cross_entropy`.
#logit_cross_entropy([1.9514772580227577, 1.848623792879922, -6.456514365666464, -5.10149420660439, 3.5482988599977925, -2.210665982825723, 4.338133361390976, -8.256227686203022, -9.262014936475973, -1.3432975629321589], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
#)

7.039678743362121

In [7]:
# Half the sum of squared differences between `y` and `ŷ` (no division by the length).
function mean_squared_loss(y::Vector, ŷ::Vector)
    residual = y - ŷ
    return sum(0.5 .* residual .^ 2)
end
# Dense layer: view the flat weight vector `w` as an `n × m` matrix, multiply it by `v`
# and apply `activation` to every element of the product.
function fullyconnected(w::Vector, n::Number, m::Number, v::Vector, activation::Function)
    weights = reshape(w, n, m)
    preactivation = weights * v
    return activation.(preactivation)
end
# Width of the hidden layer.
n1 = 25
# Length of one input vector; `net` reshapes the hidden weights to n1 × input.
input = 784
# Number of network outputs; `net` reshapes the output weights to output × n1.
output = 10
Wh  = randn(n1,input) # hidden-layer weights, n1 × input = 25 × 784
Wo  = randn(output,n1) # output-layer weights, output × n1 = 10 × 25
# Gradient buffers with the same shapes as the weight matrices.
dWh = similar(Wh)
dWo = similar(Wo)
# Loss history; the training loop pushes one value per step.
E = Float64[]
# NOTE(review): `correct` and `all` are not read anywhere in the visible code, and `all`
# has the same name as Base's `all` function — consider renaming or removing.
correct = 0
all = 0
# Start from the first training sample and its one-hot target.
x = x1[:, 1]
y = yhot[:,1]
#yhot = yhot[:,1]

# Forward pass and loss of the two-layer network: a sigmoid hidden layer built from the
# flat weights `wh`, a linear output layer built from `wo`, then the logit cross-entropy
# of the outputs against `y`. Layer sizes come from the globals `n1`, `input`, `output`.
function net(x, wh, wo, y)
    hidden = fullyconnected(wh, n1, input, x, σ)
    logits = fullyconnected(wo, output, n1, hidden, identity)
    return logit_cross_entropy(logits, y)
end

#Ei = net(x, Wh[:],  Wo[:], y, yhot)

net (generic function with 1 method)

In [10]:
# Derivative of the network loss with respect to the flat hidden-layer weights `wh`,
# computed with `J` while `x`, `wo` and `y` are held fixed.
function dnet_Wh(x, wh, wo, y)
    loss_of_hidden(w) = net(x, w, wo, y)
    return J(loss_of_hidden, wh)
end
# Derivative of the network loss with respect to the flat output-layer weights `wo`,
# computed with `J` while `x`, `wh` and `y` are held fixed.
function dnet_Wo(x, wh, wo, y)
    loss_of_output(w) = net(x, wh, w, y)
    return J(loss_of_output, wo)
end
println("dupa1")
# Stochastic gradient descent: one step per training sample with learning rate 0.1.
for i in 101:1000
    # Select sample `i` before the step so that the loss and gradients of iteration `i`
    # belong to sample `i`. `global` makes the rebinding of the top-level `x` and `y`
    # explicit, so the loop behaves the same in a script as in a notebook or REPL.
    global x = x1[:, i]
    global y = yhot[:, i]
    # Flatten the weights once; both gradients are evaluated at these pre-update values.
    local wh_flat = Wh[:]
    local wo_flat = Wo[:]
    Ei  = net(x, wh_flat, wo_flat, y)
    println("Loss:", Ei)
    push!(E, Ei)
    # The derivative matrices are written element by element into the gradient buffers.
    dWh[:] = dnet_Wh(x, wh_flat, wo_flat, y)
    dWo[:] = dnet_Wo(x, wh_flat, wo_flat, y)
    Wh .-= 0.1dWh
    Wo .-= 0.1dWo
end

dupa1
Loss:0.6528753311036898
Loss:6.9603802176279785
Loss:3.931078524913497
Loss:0.3094221998815187
Loss:2.3856341460429835
Loss:1.2105866231086957
Loss:1.2719136335020222
Loss:3.0105453997267086
Loss:0.6001055288727974
Loss:9.209726832173196
Loss:2.7539565848165646
Loss:4.863962249562979
Loss:1.6359622113903183
Loss:0.9143057111944424
Loss:0.0803250000828219
Loss:1.477061033036103
Loss:2.1660688319227126
Loss:5.150468581081785
Loss:6.246379599232945
Loss:2.780069746095853
Loss:0.3029446323574035
Loss:2.7545059606549893
Loss:0.7342529798738292
Loss:1.947271219407915
Loss:2.2430138586774744
Loss:1.017403732307133
Loss:4.0675860717720855
Loss:2.887866597855485
Loss:4.0079472511311725
Loss:0.429221664794923
Loss:0.8329438054345466
Loss:1.0987673200713433
Loss:1.3620602028745799
Loss:6.82186860378852
Loss:1.7521038147708012
Loss:1.7932292193331467
Loss:3.3643772866778736
Loss:0.95134005399703
Loss:3.5781109406228526
Loss:4.270840968103776
Loss:1.2101079236229881
Loss:3.884226173632929
Los

In [45]:
# Evaluate Flux's logit cross-entropy of `Ei` against the first one-hot target column.
# NOTE(review): in the visible cells `Ei` only receives values returned by `net` (a loss),
# not a vector of scores — confirm that it is the intended first argument here.
Flux.logitcrossentropy(Ei, yhot[:,1])

6.060088640808289