In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(101)
n = 500
x = np.random.uniform(low=0, high=10, size=n)
y = np.array([1 if x>0.5 else 0 for x in np.random.uniform(low=0, high=1, size=n)])

In [3]:
data = pd.DataFrame({
    "x": x,
    "y": y
})
data.head()

Unnamed: 0,x,y
0,5.163986,1
1,5.706676,0
2,0.284742,1
3,1.715217,1
4,6.85277,1


In [4]:
def f(w,b,x):
  return 1/(1+np.exp(-(w*x+b)))

def error(w,b):
  err = 0
  for x,y in zip(data["x"], data["y"]):
    fx = f(w,b,x)
    err += 0.5 * (fx-y)**2
  return err

def grad_b(w,b,x,y):
  fx = f(w,b,x)
  return (fx-y)*fx*(1-fx)

def grad_w(w,b,x,y):
  fx = f(w,b,x)
  return (fx-y)*fx*(1-fx)*x

def gradient_descent():
  w, b, eta, max_epochs = -2, -2, 0.1, 1000
  for i in range(max_epochs):
    dw, db = 0, 0
    for x,y, in zip(data["x"], data["y"]):
      dw += grad_w(w,b,x,y)
      db += grad_b(w,b,x,y)
    w = w - eta*dw
    b = b - eta*db
  return(b, w)

In [5]:
gradient_descent()

(2.6164905765748183, -7.618195999812198)

In [6]:
def momentum_gradient_descent(init_w, init_b):
  w, b, eta, max_epochs = init_w, init_b, 0.1, 1000
  prev_v_w, prev_v_b, gamma = 0, 0, 0.9
  for i in range(max_epochs):
    dw, db = 0, 0
    for x, y in zip(data["x"], data["y"]):
      dw += grad_w(w,b,x,y)
      db += grad_b(w,b,x,y)

    v_w = gamma * prev_v_w + eta * dw
    v_b = gamma * prev_v_b + eta * db
    w = w - v_w
    b = b - v_b
    prev_v_w = v_w
    prev_v_b = v_b
  return(b, w)

In [7]:
momentum_gradient_descent(-2, -2)

(4.697339084923113, -31.18068393865221)

In [8]:
def nesterov_accelerated_gradient_descent(init_w, init_b):
  w, b, eta, max_epochs = init_w, init_b, 0.1, 1000
  prev_v_w, prev_v_b, gamma = 0, 0, 0.9
  for i in range(max_epochs):
    dw, db = 0, 0
    v_w = gamma * prev_v_w
    v_b = gamma * prev_v_b
    for x, y in zip(data["x"], data["y"]):
      dw += grad_w(w-v_w, b-v_w, x, y)
      db += grad_b(w-v_w, b-v_w, x, y)
    v_w = gamma * prev_v_w + eta * dw
    v_b = gamma * prev_v_b + eta * db
    w = w - v_w
    b = b - v_b
    prev_v_w = v_w
    prev_v_b = v_b
  return(b, w)

In [9]:
nesterov_accelerated_gradient_descent(-2, -2)

  return 1/(1+np.exp(-(w*x+b)))


(-16.9473907384958, -139.70737056392775)

In [10]:
def adagrad(init_w, init_b):
  w, b, eta, max_epochs = init_w, init_b, 0.1, 1000
  v_w, v_b, eps = 0, 0, 1e-8
  for i in range(max_epochs):
    dw, db = 0, 0
    for x, y in zip(data["x"], data["y"]):
      dw += grad_w(w, b, x, y)
      db += grad_b(w, b, x, y)
    v_w = v_w + dw**2
    v_b = v_b + db**2
    w = w - (eta/np.sqrt(v_w + eps)) * dw
    b = b - (eta/np.sqrt(v_b + eps)) * db
  return(w, b)

In [11]:
adagrad(-2, -2)

(-0.07032628082869267, 0.21188126515777667)

In [12]:
import math
def adam(init_w, init_b):
  w_b_dw_db = [(init_w, init_b, 0, 0)]
  w_history, b_history, error_history = [], [], []
  w, b, eta, mini_batch_size, num_points_seen = init_w, init_b, 0.1, 10, 0
  m_w, m_b, v_w, v_b, m_w_hat, m_b_hat, v_w_hat, v_b_hat, eps, beta1, beta2 = 0,0,0,0,0,0,0,0,1e-8,0.9,0.999
  max_epochs = 1000
  for i in range(max_epochs):
    dw, db = 0, 0
    for x, y in zip(data["x"], data["y"]):
      dw += grad_w(w, b, x, y)
      db += grad_b(w, b, x, y)
    m_w = beta1 * m_w + (1-beta1) * dw
    m_b = beta1 * m_b + (1-beta1) * db
    v_w = beta2 * v_w + (1-beta2) * dw**2
    v_b = beta2 * v_b + (1-beta2) * db**2
    m_w_hat = m_w/(1-math.pow(beta1,i+1))
    m_b_hat = m_b/(1-math.pow(beta1,i+1))
    v_w_hat = v_w/(1-math.pow(beta2,i+1))
    v_b_hat = v_b/(1-math.pow(beta2,i+1))
    w = w - (eta/np.sqrt(v_w_hat + eps)) * m_w_hat
    b = b - (eta/np.sqrt(v_b_hat + eps)) * m_b_hat
  return(b, w)


In [13]:
adam(-2, -2)

(0.2118918327207978, -0.07032790517074021)