In [57]:
import numpy as np

# Gradient Descent

In [80]:
def gd(theta, grad, eta, data, max_iter=10):
    # Takes in original parameter estimate theta, gradient function grad, step-size eta
    # Returns parameter estimates obtained after max_iter steps
    thetas = np.zeros((theta.shape[0], max_iter + 1))
    thetas[:, 0] = theta.flatten()
    for i in range(max_iter):
        theta = theta + eta * grad(theta, data)
        thetas[:, i+1] = theta.flatten()
    return thetas

# Newton-Raphson

In [81]:
def nr(theta, grad, data, inverse_hessian, max_iter=10):
    # Takes in original parameter estimate theta, gradient function grad
    # Returns parameter estimates obtained after max_iter eta
    thetas = np.zeros((theta.shape[0], max_iter + 1))
    thetas[:, 0] = theta.flatten()
    for i in range(max_iter):
        theta = theta - inverse_hessian(theta, data) @ grad(theta, data)
        thetas[:, i+1] = theta.flatten()
    return thetas

# Fisher's scoring

In [82]:
def fisher(theta, grad, data, inverse_info, max_iter=10):
    # Takes in original parameter estimate theta, gradient function grad
    # Returns parameter estimates obtained after max_iter eta
    thetas = np.zeros((theta.shape[0], max_iter + 1))
    thetas[:, 0] = theta.flatten()
    for i in range(max_iter):
        theta = theta - inverse_info(theta, data) @ grad(theta, data)
        thetas[:, i+1] = theta.flatten()
    return thetas

# BFGS

In [83]:
def bfgs(f, theta, grad, data, max_iter=10):
    # Takes in original parameter estimate theta, negative likelihood function f, negative gradient function grad
    # Returns parameter estimates obtained after max_iter eta
    thetas = np.zeros((theta.shape[0], max_iter + 1))
    thetas[:, 0] = theta.flatten()
    I = np.diag(np.ones(theta.shape[0]))
    B = np.copy(I)
    B_inverse = np.copy(I)
    for i in range(max_iter):
        p = -B_inverse @ grad(theta, data)
        alpha = line_search(f, grad, p , theta, 1, data)
        s = alpha * p
        next_theta = (theta + s)
        y = grad(next_theta, data) - grad(theta, data)
        B = B + (y @ y.T)/(y.T @ s) - (B @ s @ s.T @ B.T)/(s.T @ B @ s)
        B_inverse = (I - (s @ y.T)/(y.T @ s)) @ B_inverse @ (I - (s @ y.T)/(y.T @ s)) + (s @ s.T)/(y.T @ s)
        theta = next_theta
        thetas[:, i+1] = theta.flatten()
    return thetas

def line_search(f, grad, p, theta, alpha, data, lower = 0, upper = float("inf"), c1 = 0.0001, c2 = 0.1):
    if f(theta, data) + c1 * alpha * p.T @ grad(theta, data) - f(theta + alpha * p, data) < 0:
        return line_search(f, grad, p, theta, 0.5*(lower + alpha), data, lower = 0, upper = alpha)
    elif -c2 * p.T @ grad(theta, data) + p.T @ grad(theta + alpha * p, data) < 0:
        return line_search(f, grad, p, theta, 2*alpha, data, lower = alpha, upper = float("inf"))
    return alpha

# Example 1 - Poisson 

In [84]:
np.random.seed(0)
data = np.random.poisson(lam = 56.3, size = 1000)

In [85]:
def l1(theta, data):
    return np.sum(data)*np.log(theta) - len(data)*theta

def neg_l1(theta, data):
    return -l1(theta, data)
    
def grad1(theta, data):
    return np.sum(data)/theta - len(data)

def neg_grad1(theta, data):
    return -grad1(theta, data)

def inverse_hessian1(theta, data):
    return -(theta**2)/np.sum(data)

def inverse_info1(theta, data):
    return -(theta)/len(data)

In [86]:
theta0 = np.array([[5]])

In [87]:
%%timeit
gd(theta0, grad1, 1/len(data), data, max_iter = 100).flatten()

array([ 5.        , 15.2796    , 17.97066521, 20.10900201, 21.91361658,
       23.48726753, 24.88848342, 26.1545114 , 27.31085067, 28.37589071,
       29.36342303, 30.28411191, 31.14640855, 31.95714702, 32.72194786,
       33.44550051, 34.1317662 , 34.78412722, 35.40549894, 35.99841534,
       36.56509541, 37.10749533, 37.62734999, 38.12620654, 38.60545153,
       39.06633329, 39.50998043, 39.93741723, 40.34957665, 40.74731126,
       41.13140259, 41.50256905, 41.86147288, 42.208726  , 42.54489521,
       42.87050666, 43.18604977, 43.49198075, 43.78872555, 44.07668265,
       44.35622541, 44.6277042 , 44.89144835, 45.14776781, 45.3969547 ,
       45.63928473, 45.87501838, 46.10440209, 46.32766923, 46.54504107,
       46.75672761, 46.96292836, 47.16383303, 47.35962219, 47.55046784,
       47.73653398, 47.91797711, 48.09494666, 48.26758545, 48.43603006,
       48.6004112 , 48.76085405, 48.91747859, 49.07039983, 49.21972814,
       49.3655695 , 49.50802568, 49.64719451, 49.78317008, 49.91

In [88]:
%%timeit
gd(theta0, grad1, 10/len(data), data, max_iter = 50).flatten()

array([  5.        , 107.796     , 103.02791955,  98.50196972,
        94.2275405 ,  90.21283946,  86.46449946,  82.98717644,
        79.78316618,  76.85207592,  74.19058946,  71.79236219,
        69.6480725 ,  67.74564046,  66.07060419,  64.60662421,
        63.33606963,  62.24063228,  61.30191555,  60.50195433,
        59.82363668,  59.25101407,  58.76950095,  58.36597516,
        58.02879666,  57.74776436,  57.51402988,  57.31998507,
        57.15913627,  57.02597536,  56.91585443,  56.82486846,
        56.74974845,  56.68776607,  56.63664991,  56.5945129 ,
        56.55978994,  56.53118482,  56.50762529,  56.4882252 ,
        56.47225281,  56.45910426,  56.44828152,  56.43937398,
        56.43204329,  56.42601067,  56.42104653,  56.41696179,
        56.41360078,  56.41083535,  56.40856002])

In [89]:
%%timeit
nr(theta0, grad1, data, inverse_hessian1, max_iter=10).flatten()

array([ 5.        ,  9.55672187, 17.49404353, 29.56162586, 43.6282036 ,
       53.50662619, 56.24976704, 56.39761039, 56.398     , 56.398     ,
       56.398     ])

In [90]:
%%timeit
fisher(theta0, grad1, data, inverse_info1, max_iter=10).flatten()

array([ 5.   , 56.398, 56.398, 56.398, 56.398, 56.398, 56.398, 56.398,
       56.398, 56.398, 56.398])

In [91]:
%%timeit
bfgs(neg_l1, theta0, neg_grad1, data).flatten()

array([  5.        , 165.61875   ,  10.69020643, 144.91602358,
        44.24479993,  59.85877053,  54.42875174,  56.51883968,
        56.40221936,  56.39799096,  56.398     ])

# Example 2 - Normal Distribution

In [92]:
np.random.seed(0)
data = np.random.normal(50, 5, size = 1000)

In [131]:
def l2(theta, data):
    return -(1/2)*np.sum(((data - theta[0])/theta[1])**2) - len(data)*np.log(theta[1])

def neg_l2(theta, data):
    return -l2(theta, data)
    
def grad2(theta, data):
    return np.array([(np.sum(data) - len(data)*theta[0])/theta[1]**2, 
                     np.sum((data - theta[0])**2/theta[1]**3) - len(data)/theta[1]])

def neg_grad2(theta, data):
    return -grad2(theta, data)

def inverse_hessian2(theta, data):
    return np.linalg.inv(np.r_[np.c_[-len(data)/theta[1]**2, 
                                     (-2*(np.sum(data) - len(data)*theta[0]))/theta[1]**3], 
                               np.c_[(-2*(np.sum(data) - len(data)*theta[0]))/theta[1]**3,
                                     -3*np.sum((data - theta[0])**2/theta[1]**4) + len(data)/theta[1]**2]])

def inverse_info2(theta, data):
    return np.linalg.inv(np.r_[np.c_[len(data)/theta[1]**2, 
                                     0], 
                               np.c_[0,
                                     2*len(data)/theta[1]]])

In [98]:
theta0 = np.array([10,2])[:, None]

In [112]:
#%%timeit
gd(theta0, grad2, 1/len(data), data, max_iter = 10)

array([[ 10.        ,  19.94342912,  19.9441581 ,  19.9448871 ,
         19.94561611,  19.94634515,  19.9470742 ,  19.94780326,
         19.94853235,  19.94926145,  19.94999057],
       [  2.        , 202.28804783, 202.28321483, 202.27838171,
        202.27354847, 202.26871512, 202.26388165, 202.25904807,
        202.25421438, 202.24938056, 202.24454663]])

In [115]:
#%%timeit
gd(theta0, grad1, 10/len(data), data, max_iter = 10)

array([[ 10.        ,  49.77371646,  49.77371646,  49.77371646,
         49.77371646,  49.77371646,  49.77371646,  49.77371646,
         49.77371646,  49.77371646,  49.77371646],
       [  2.        , 240.86858231, 232.93500858, 225.07181561,
        217.28327493, 209.57400423, 201.94899901, 194.41366666,
        186.97386303, 179.63593117, 172.4067419 ]])

In [121]:
#%%timeit
gd(theta0, grad1, 100/len(data), data, max_iter = 100)

array([[1.00000000e+01, 4.07737165e+02, 3.19944469e+02, 2.35501455e+02,
        1.56636660e+02, 8.84132039e+01, 4.47099043e+01, 5.60358337e+01,
        4.48606324e+01, 5.58125156e+01, 4.49927217e+01, 5.56188732e+01,
        4.51095687e+01, 5.54491660e+01, 4.52137559e+01, 5.52990949e+01,
        4.53072886e+01, 5.51653666e+01, 4.53917523e+01, 5.50454094e+01,
        4.54684201e+01, 5.49371818e+01, 4.55383281e+01, 5.48390392e+01,
        4.56023297e+01, 5.47496404e+01, 4.56611352e+01, 5.46678786e+01,
        4.57153415e+01, 5.45928318e+01, 4.57654544e+01, 5.45237245e+01,
        4.58119058e+01, 5.44598993e+01, 4.58550673e+01, 5.44007950e+01,
        4.58952602e+01, 5.43459286e+01, 4.59327646e+01, 5.42948824e+01,
        4.59678251e+01, 5.42472930e+01, 4.60006575e+01, 5.42028424e+01,
        4.60314518e+01, 5.41612511e+01, 4.60603773e+01, 5.41222722e+01,
        4.60875842e+01, 5.40856868e+01, 4.61132072e+01, 5.40513002e+01,
        4.61373671e+01, 5.40189381e+01, 4.61601727e+01, 5.398844

In [123]:
#%%timeit
nr(theta0, grad2, data, inverse_hessian2, max_iter=10)

array([[ 1.00000000e+01, -3.23349838e+01, -1.15640238e+02,
        -2.81643301e+02, -6.13353761e+02, -1.27662782e+03,
        -2.60310263e+03, -5.25601562e+03, -1.05618233e+04,
        -2.11734294e+04, -4.23966371e+04],
       [ 2.00000000e+00,  4.06439598e+00,  8.15840678e+00,
         1.63313413e+01,  3.26699126e+01,  6.53434361e+01,
         1.30688677e+02,  2.61378257e+02,  5.22756964e+02,
         1.04551415e+03,  2.09102842e+03]])

In [125]:
#%%timeit
fisher(theta0, grad2, data, inverse_info2, max_iter=10)

array([[ 1.00000000e+01, -2.97737165e+01, -1.09321149e+02,
        -2.68416015e+02, -5.86605747e+02, -1.22298521e+03,
        -2.49574414e+03, -5.04126199e+03, -1.01322977e+04,
        -2.03143691e+04, -4.06785119e+04],
       [ 2.00000000e+00, -1.98288048e+02, -1.97868827e+02,
        -1.97692380e+02, -1.98487968e+02, -2.03127931e+02,
        -2.22258349e+02, -2.87343839e+02, -4.43800199e+02,
        -7.06488996e+02, -1.12141291e+03]])

In [132]:
#%%timeit
bfgs(neg_l2, theta0, neg_grad2, data)

  


array([[ 1.00000000e+01,  3.20732160e+02,  1.10988431e+03,
         1.07689128e+03,  1.09274097e+03,  1.09274097e+03,
         1.09274097e+03,  1.09274097e+03, -1.08960272e+04,
        -7.41783081e+03,  1.08919716e+07],
       [ 2.00000000e+00,  6.26100149e+03,  9.86721009e+02,
         1.20698102e+03,  1.08211944e+03,  1.08211944e+03,
         1.08211944e+03,  1.08211944e+03, -4.15853555e+03,
        -3.47985155e+02,  5.00329071e+05]])