In [1]:
function nabla(f, x::Float64, delta::Float64)

  ## differentiation of holomorphic functions in a single complex variable applied 
  ## to real-valued functions in a single variable using the Cauchy Integral Formula

  ## f:= the function to be differentiated
  ## x:= where the derivative is evaluated
  ## delta:= the sampling frequency

  N = round(Int,2*pi/delta)
  thetas = vcat(1:N)*delta

  ## collect arguments and rotations: 
  rotations = map(theta -> exp(-im*theta),thetas)
  arguments = x .+ conj.(rotations)  

  ## calculate expectation: 
  expectation = 1.0/N*real(sum(map(f,arguments).*rotations))

  return expectation

end

function partial_nabla(f, i::Int64, X::Array{Float64,1},delta::Float64)

  ## partial differentiation of holomorphic functions in a single complex variable applied 
  ## to real-valued functions in a single variable using the Cauchy Integral Formula

  ## f:= the function to be differentiated
  ## i:= partial differentiation with respect to this index
  ## X:= where the partial derivative is evaluated
  ## delta:= the sampling frequency

  N = length(X)

  kd(i,n) = [j==i for j in 1:n]

  f_i = x -> f(x*kd(i,N) .+ X.*(ones(N)-kd(i,N)))

  return nabla(f_i,X[i],delta)

end


function jacobian(f,X::Array{Float64,1},case::Int64,delta::Float64)
    
    N = Int(length(X))
    
    if case == 1
    
        ## initialise jacobian: 
        J = zeros(N,N)

        for i = 1:N

            f_i(x) = f(x)[i]
            J[i,:] = [partial_nabla(f_i,j,X,delta) for j=1:N]

        end

        return J
        
    else
            
       return [partial_nabla(f,j,X,delta) for j=1:N]

   end
        
end

jacobian (generic function with 1 method)

## We now define the method for gradient updates: 

In [2]:
function gradient_updates(W::Array{Float64,2},X::Array{Float64,1},case::Int64,delta::Float64)

	relu(x) = log.(1 .+ exp.(x))

	if case == 1

		g(Z) = relu([sum(W[1,:].*Z) ,sum(W[2,:].*Z),sum(W[3,:].*Z)] .+W[4,:])

		return jacobian(g,X,1,delta)

	else 

		## there is probably a way to simplify this: 
		g_1(Z) = relu([sum(Z.*X) ,sum(W[2,:].*X),sum(W[3,:].*X)] .+W[4,:])

		g_2(Z) = relu([sum(W[1,:].*X) ,sum(Z.*X),sum(W[3,:].*X)] .+W[4,:])

		g_3(Z) = relu([sum(W[1,:].*X) ,sum(W[2,:].*X),sum(Z.*X)] .+W[4,:])

		g_4(Z) = relu([sum(W[1,:].*X) ,sum(W[2,:].*X),sum(W[3,:].*X)] .+Z)

		return [jacobian(g_1,W[1,:],1,delta), jacobian(g_2,W[2,:],1,delta),jacobian(g_3,W[3,:],1,delta),jacobian(g_4,W[4,:],1,delta)]

	end

end

gradient_updates (generic function with 1 method)

## Testing jacobian on gradient descent: 

In [3]:
function gradient_descent_(f)
    
    x = ones(3)

    L(x) = sum((f(x) .- zeros(3)).^2)

    alpha = 0.1

    for i = 1:100
        
            #update = alpha*2*(f(x) .- zeros(3))'*jacobian(f,x,1,2*pi/10)
            update = alpha*jacobian(L,x,2,2*pi/10)'*jacobian(f,x,1,2*pi/10)

            x -= update'      

    end

    return x
    
end

gradient_descent_ (generic function with 1 method)

In [21]:
## test it on a linear function: 
W = rand(4,3)

f(Z) = [sum(W[1,:].*Z) ,sum(W[2,:].*Z),sum(W[3,:].*Z)] .+W[4,:]

## the result looks reasonable: 
x_ = gradient_descent_(f);

L(x) = (zeros(3) .- f(x)).^2

#L(x_)

f(x_)

3-element Array{Float64,1}:
 -0.0900236607273408
 -0.2360883810597736
  0.4642845054045776

## Test it on learning the identity function: 

In [33]:
using Distributions

function GD(x)
    
    ## delta: 
    delta = 2*pi/100
    
    ## We define a linear function: 
    #W = rand(3,2)
    
    ## we use Xavier initialisation: 
    W = rand(Uniform(-1/sqrt(3),1/sqrt(3)),(4,3))

    f(Z) = [sum(W[1,:].*Z) ,sum(W[2,:].*Z),sum(W[3,:].*Z)] .+W[4,:]

    ## our squared loss: 
    L(x) = sum((x .- f(x)).^2)

    alpha = 0.1

    for i = 1:50

    	X = 10*rand(3)
    
        ## gradient updates:         
        dL = alpha*jacobian(L,X,2,2*pi/100)'
    
        dW_ = gradient_updates(W,X,2,delta)
    
        W[1,:] -= (alpha*dL*dW_[1])'

		W[2,:] -= (alpha*dL*dW_[2])'

		W[3,:] -= (alpha*dL*dW_[3])'

		W[4,:] -= (alpha*dL*dW_[4])'

    end

    return W
    
end

GD (generic function with 1 method)

In [35]:
W_ = GD(zeros(3))

f(Z) = [sum(W_[1,:].*Z) ,sum(W_[2,:].*Z),sum(W_[3,:].*Z)] .+W_[4,:]

f(3*ones(3))

3-element Array{Float64,1}:
 -6.130651445471345
 -6.968973297073677
 -6.193575809252167

## Now, let's try batch gradients: 

In [53]:
function batch_GD(x)
    
    ## delta: 
    delta = 2*pi/100
    
    ## We define a linear function: 
    W = rand(Uniform(-1/sqrt(3),1/sqrt(3)),(4,3))

    f(Z) = [sum(W[1,:].*Z) ,sum(W[2,:].*Z),sum(W[3,:].*Z)] .+W[4,:]

    ## our squared loss: 
    L(x) = sum((x .- f(x)).^2)

    alpha = 0.01

    for i = 1:100
        
        W_ = rand(Uniform(-1/sqrt(3),1/sqrt(3)),(4,3))
        
        for j = 1:10

            X = 10*rand(3)

            ## gradient updates:         
            dL = alpha*jacobian(L,X,2,delta)'

            dW_ = gradient_updates(W,X,2,delta)
            
            W_[1,:] += (alpha*dL*dW_[1])'
            
            W_[2,:] += (alpha*dL*dW_[2])'
            
            W_[3,:] += (alpha*dL*dW_[3])'
            
            W_[4,:] += (alpha*dL*dW_[4])'
            
        end
        
        W[1,:] -= 1/10*W_[1,:]

        W[2,:] -= 1/10*W_[2,:]

        W[3,:] -= 1/10*W_[3,:]

        W[4,:] -= 1/10*W_[4,:]

    end

    return W
    
end

batch_GD (generic function with 1 method)

In [54]:
W_ = batch_GD(zeros(3));

f(Z) = [sum(W_[1,:].*Z) ,sum(W_[2,:].*Z),sum(W_[3,:].*Z)] .+W_[4,:]

f (generic function with 1 method)

In [57]:
f(3*ones(3))

3-element Array{Float64,1}:
 2.4765392348267405 
 2.723247046320566  
 0.21503815400008325

In [None]:
L(x) = sum((x .- f(x)).^2)

N = 100

X = 10*rand(N,3)

sum([L(X[i,:]) for i =1:N])/N

In [None]:
X = rand(3)

W = rand(4,3)

f(Z) = [sum(W[1,:].*Z) ,sum(W[2,:].*Z),sum(W[3,:].*Z)] .+W[4,:]

## our squared loss: 
L(x) = sum((x .- f(x)).^2)
        
## update functions: 
f_1(Z) = [sum(Z.*X) ,sum(W[2,:].*X),sum(W[3,:].*X)] .+W[4,:]

f_2(Z) = [sum(W[1,:].*X) ,sum(Z.*X),sum(W[3,:].*X)] .+W[4,:]

f_3(Z) = [sum(W[1,:].*X) ,sum(W[2,:].*X),sum(Z.*X)] .+W[4,:]

f_4(Z) = [sum(W[1,:].*X) ,sum(W[2,:].*X),sum(W[3,:].*X)] .+Z

## gradient updates: 
dL = alpha*jacobian(L,X,2,2*pi/10)'

#[partial_nabla(L,j,X,delta) for j=1:3]

dW_ = [jacobian(f_1,W[1,:],1,delta), jacobian(f_2,W[2,:],1,delta),jacobian(f_3,W[3,:],1,delta),jacobian(f_4,W[4,:],1,delta)]



In [None]:
(alpha*dL*dW_[1]*dW_[1])'

In [None]:
W[1,:]