In [None]:
import numpy as np


# Gradient Descent (ascent form: each step moves along the gradient)

In [33]:
def gd(theta, grad, eta, data, max_iter=10):
    """Take ``max_iter`` fixed-size gradient steps and record every iterate.

    Sign convention: each step is ``theta + eta * grad(theta, data)``, i.e. the
    iterate moves *along* ``grad``. Pass the gradient of a function to be
    maximised (such as a log-likelihood), or the negated gradient of a function
    to be minimised.

    Parameters
    ----------
    theta : numpy.ndarray
        Starting point, either 1-D of shape ``(n,)`` or a column of shape
        ``(n, 1)``.
    grad : callable
        ``grad(theta, data)``; its result is added to ``theta`` after scaling.
    eta : float
        Step size.
    data : object
        Forwarded unchanged to ``grad``.
    max_iter : int, optional
        Number of steps to take.

    Returns
    -------
    numpy.ndarray
        Shape ``(max_iter + 1, n)``. Row 0 is the starting point, row ``k`` the
        iterate after ``k`` steps.
    """
    thetas = np.zeros((1 + max_iter, theta.shape[0]))
    # Store the flattened iterate: a plain slice assignment only broadcasts for
    # shapes (n,) and (1, 1), so column vectors with n > 1 used to raise here.
    thetas[0, :] = np.ravel(theta)
    for i in range(max_iter):
        theta = theta + eta * grad(theta, data)
        thetas[i + 1, :] = np.ravel(theta)
    return thetas

# Newton-Raphson

In [65]:
def nr(theta, grad, data, inverse_hessian, max_iter=10):
    """Run ``max_iter`` Newton-Raphson steps and record every iterate.

    Each step is
    ``theta - inverse_hessian(theta, data) @ grad(theta, data)``.

    Parameters
    ----------
    theta : numpy.ndarray
        Starting point, either 1-D of shape ``(n,)`` or a column of shape
        ``(n, 1)``.
    grad : callable
        ``grad(theta, data)`` returning the gradient at ``theta``.
    data : object
        Forwarded unchanged to ``grad`` and ``inverse_hessian``.
    inverse_hessian : callable
        ``inverse_hessian(theta, data)`` returning the ``(n, n)`` inverse of
        the Hessian at ``theta``.
    max_iter : int, optional
        Number of steps to take.

    Returns
    -------
    numpy.ndarray
        Shape ``(max_iter + 1, n)``. Row 0 is the starting point, row ``k`` the
        iterate after ``k`` steps.
    """
    thetas = np.zeros((1 + max_iter, theta.shape[0]))
    # Store the flattened iterate: a plain slice assignment only broadcasts for
    # shapes (n,) and (1, 1), so column vectors with n > 1 used to raise here.
    thetas[0, :] = np.ravel(theta)
    for i in range(max_iter):
        theta = theta - inverse_hessian(theta, data) @ grad(theta, data)
        thetas[i + 1, :] = np.ravel(theta)
    return thetas

# Fisher's scoring

In [93]:
def fisher(theta, grad, data, inverse_info, max_iter=10):
    """Run ``max_iter`` Fisher-scoring steps and record every iterate.

    Each step is ``theta - inverse_info(theta, data) @ grad(theta, data)``.
    Because the product is *subtracted*, an ascent on a log-likelihood needs
    ``inverse_info`` to return the negated inverse information matrix.

    Parameters
    ----------
    theta : numpy.ndarray
        Starting point, either 1-D of shape ``(n,)`` or a column of shape
        ``(n, 1)``.
    grad : callable
        ``grad(theta, data)`` returning the gradient (score) at ``theta``.
    data : object
        Forwarded unchanged to ``grad`` and ``inverse_info``.
    inverse_info : callable
        ``inverse_info(theta, data)`` returning an ``(n, n)`` matrix; see the
        sign convention above.
    max_iter : int, optional
        Number of steps to take.

    Returns
    -------
    numpy.ndarray
        Shape ``(max_iter + 1, n)``. Row 0 is the starting point, row ``k`` the
        iterate after ``k`` steps.
    """
    thetas = np.zeros((1 + max_iter, theta.shape[0]))
    # Store the flattened iterate: a plain slice assignment only broadcasts for
    # shapes (n,) and (1, 1), so column vectors with n > 1 used to raise here.
    thetas[0, :] = np.ravel(theta)
    for i in range(max_iter):
        theta = theta - inverse_info(theta, data) @ grad(theta, data)
        thetas[i + 1, :] = np.ravel(theta)
    return thetas

# BFGS

In [166]:
def bfgs(f, theta, grad, data, max_iter=10):
    """Minimise ``f`` with BFGS and record every iterate.

    Unlike the ascent-style routines in this notebook, this one *minimises*:
    pass the negated log-likelihood as ``f`` and its gradient (the negated
    score) as ``grad``.

    Parameters
    ----------
    f : callable
        ``f(theta, data)`` returning the objective value (a scalar or a
        one-element array).
    theta : numpy.ndarray
        Starting point, either 1-D of shape ``(n,)`` or a column of shape
        ``(n, 1)``.
    grad : callable
        ``grad(theta, data)`` returning the gradient of ``f`` at ``theta``.
    data : object
        Forwarded unchanged to ``f`` and ``grad``.
    max_iter : int, optional
        Number of BFGS iterations.

    Returns
    -------
    numpy.ndarray
        Shape ``(max_iter + 1, n)``. Row 0 is the starting point, row ``k`` the
        iterate after ``k`` iterations.
    """
    shape = np.shape(theta)
    n = shape[0]
    thetas = np.zeros((1 + max_iter, n))
    thetas[0, :] = np.ravel(theta)
    I = np.eye(n)
    # Only the inverse-Hessian approximation is needed to build the search
    # direction; the direct approximation that used to be updated alongside it
    # was never read and has been dropped.
    B_inverse = np.eye(n)
    g = grad(theta, data)
    for i in range(max_iter):
        # Search direction, handed on in the same shape as theta so that
        # ``theta + alpha * p`` never broadcasts to a matrix.
        p = (-(B_inverse @ np.ravel(g))).reshape(shape)
        alpha = line_search(f, grad, p, theta, 1, data)
        s = alpha * p
        next_theta = theta + s
        next_g = grad(next_theta, data)

        # Work with flat vectors so the outer products below are (n, n) for
        # both 1-D and column inputs.
        s_vec = np.ravel(s)
        y_vec = np.ravel(next_g) - np.ravel(g)
        ys = y_vec @ s_vec
        # Curvature safeguard: update only when y's > 0 (relative to the vector
        # lengths). At convergence s and y vanish and the update would be 0/0,
        # turning every later iterate into NaN.
        if ys > 1e-10 * np.linalg.norm(s_vec) * np.linalg.norm(y_vec):
            # H+ = (I - s y'/y's) H (I - y s'/y's) + s s'/y's.
            # The right-hand factor is the transpose of the left-hand one.
            V = I - np.outer(s_vec, y_vec) / ys
            B_inverse = V @ B_inverse @ V.T + np.outer(s_vec, s_vec) / ys

        theta, g = next_theta, next_g
        thetas[i + 1, :] = np.ravel(theta)
    return thetas


def line_search(f, grad, p, theta, alpha, data, lower = 0, upper = float("inf"), c1 = 0.0001, c2 = 0.1, max_steps = 100):
    """Bisection search for a step length satisfying the weak Wolfe conditions.

    A trial step ``alpha`` is accepted when both hold:

    * sufficient decrease:
      ``f(theta + alpha p) <= f(theta) + c1 * alpha * p'grad(theta)``
    * curvature:
      ``p'grad(theta + alpha p) >= c2 * p'grad(theta)``

    If the first fails the step is too long and becomes the new ``upper``; if
    only the second fails the step is too short and becomes the new ``lower``.
    The next trial doubles ``lower`` while no upper bound is known and bisects
    the bracket afterwards.

    Parameters
    ----------
    f, grad : callable
        Objective and gradient, called as ``f(theta, data)`` and
        ``grad(theta, data)``.
    p : numpy.ndarray
        Search direction, same shape as ``theta``.
    theta : numpy.ndarray
        Current point.
    alpha : float
        First trial step length.
    data : object
        Forwarded unchanged to ``f`` and ``grad``.
    lower, upper : float, optional
        Initial bracket for the step length.
    c1, c2 : float, optional
        Wolfe constants; they now apply to every trial, not only the first.
    max_steps : int, optional
        Maximum number of trial steps.

    Returns
    -------
    float
        The accepted step length, or the current trial value (best effort) if
        no step was accepted within ``max_steps`` trials.
    """
    inf = float("inf")
    p_vec = np.ravel(p)
    # Value and directional derivative at the start point do not change
    # between trials, so evaluate them once.
    f0 = f(theta, data)
    slope0 = p_vec @ np.ravel(grad(theta, data))
    for _ in range(max_steps):
        trial = theta + alpha * p
        if f0 + c1 * alpha * slope0 - f(trial, data) < 0:
            upper = alpha  # too long: sufficient decrease violated
        elif p_vec @ np.ravel(grad(trial, data)) - c2 * slope0 < 0:
            lower = alpha  # too short: curvature condition violated
        else:
            return alpha
        alpha = 2 * lower if upper == inf else 0.5 * (lower + upper)
    return alpha

# Example 1 - Poisson 

In [73]:
# Simulated sample for Example 1: 1000 Poisson draws with rate 56.3.
# The legacy global generator is seeded first so the draw is repeatable.
np.random.seed(0)
data = np.random.poisson(56.3, 1000)

In [98]:
def l1(theta, data):
    """Poisson log-likelihood of ``data`` at rate ``theta``.

    The term that does not depend on ``theta`` (the sum of log-factorials of
    the observations) is left out.
    """
    total_count = np.sum(data)
    sample_size = len(data)
    return total_count * np.log(theta) - sample_size * theta

def neg_l1(theta, data):
    """Negated log-likelihood ``-l1(theta, data)``, for routines that minimise."""
    log_lik = l1(theta, data)
    return -log_lik
    
def grad1(theta, data):
    """Derivative of the Poisson log-likelihood with respect to ``theta``.

    Evaluates ``sum(data) / theta - len(data)``.
    """
    sample_size = len(data)
    return np.sum(data) / theta - sample_size

def neg_grad1(theta, data):
    """Negated gradient ``-grad1(theta, data)``, the gradient of ``neg_l1``."""
    score = grad1(theta, data)
    return -score

def inverse_hessian1(theta, data):
    """Inverse of the second derivative of the Poisson log-likelihood.

    The second derivative is ``-sum(data) / theta**2``, so its reciprocal is
    ``-theta**2 / sum(data)``.
    """
    theta_squared = theta ** 2
    return -theta_squared / np.sum(data)

def inverse_info1(theta, data):
    """Return ``-theta / len(data)``.

    This is the *negated* inverse of the Poisson Fisher information
    ``len(data) / theta``; the sign is there because ``fisher`` subtracts the
    product of this value and the gradient.
    """
    sample_size = len(data)
    return -theta / sample_size

In [168]:
# Starting value shared by the optimiser runs below, held as a 1x1 integer array.
theta0 = np.full((1, 1), 5)

In [183]:
%time
print(gd(theta0, grad1, 1/len(data), data, max_iter = 100).flatten())
print(gd(theta0, grad1, 10/len(data), data, max_iter = 50).flatten())

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 7.39 µs
[ 5.         15.2796     17.97066521 20.10900201 21.91361658 23.48726753
 24.88848342 26.1545114  27.31085067 28.37589071 29.36342303 30.28411191
 31.14640855 31.95714702 32.72194786 33.44550051 34.1317662  34.78412722
 35.40549894 35.99841534 36.56509541 37.10749533 37.62734999 38.12620654
 38.60545153 39.06633329 39.50998043 39.93741723 40.34957665 40.74731126
 41.13140259 41.50256905 41.86147288 42.208726   42.54489521 42.87050666
 43.18604977 43.49198075 43.78872555 44.07668265 44.35622541 44.6277042
 44.89144835 45.14776781 45.3969547  45.63928473 45.87501838 46.10440209
 46.32766923 46.54504107 46.75672761 46.96292836 47.16383303 47.35962219
 47.55046784 47.73653398 47.91797711 48.09494666 48.26758545 48.43603006
 48.6004112  48.76085405 48.91747859 49.07039983 49.21972814 49.3655695
 49.50802568 49.64719451 49.78317008 49.9160429  50.04590009 50.17282557
 50.29690019 50.4182019  50.53680585 50.65278457 50.766208

In [181]:
%%time
# Newton-Raphson on the Poisson sample, 10 steps from theta0; the flattened path is the cell output.
nr(theta0, grad1, data, inverse_hessian1, max_iter=10).flatten()

CPU times: user 827 µs, sys: 293 µs, total: 1.12 ms
Wall time: 936 µs


array([ 5.        ,  9.55672187, 17.49404353, 29.56162586, 43.6282036 ,
       53.50662619, 56.24976704, 56.39761039, 56.398     , 56.398     ,
       56.398     ])

In [180]:
%%time
# Fisher scoring on the Poisson sample, 10 steps from theta0; the flattened path is the cell output.
fisher(theta0, grad1, data, inverse_info1, max_iter=10).flatten()

CPU times: user 616 µs, sys: 165 µs, total: 781 µs
Wall time: 657 µs


array([ 5.   , 56.398, 56.398, 56.398, 56.398, 56.398, 56.398, 56.398,
       56.398, 56.398, 56.398])

In [179]:
%%time
# BFGS is given the negated log-likelihood and its negated gradient; max_iter is left at its default of 10.
bfgs(neg_l1, theta0, neg_grad1, data).flatten()

CPU times: user 5.91 ms, sys: 2.33 ms, total: 8.24 ms
Wall time: 7.48 ms


array([  5.        , 165.61875   ,  10.69020643, 144.91602358,
        44.24479993,  59.85877053,  54.42875174,  56.51883968,
        56.40221936,  56.39799096,  56.398     ])

# Example 2 - Normal Distribution