In [9]:
import numpy as np
import copy, math

In [None]:
# Toy binary-classification set: six examples, two features per example.
X_train = np.array(
    [
        [5, 8],
        [1, 1],
        [10, 9],
        [3, 2],
        [6, 5],
        [7, 8],
    ],
    dtype=np.float64,
)
# One 0/1 label per row of X_train, stored as float64.
y_train = np.array([1, 0, 1, 0, 0, 1], dtype=np.float64)

In [15]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array_like
        Input value(s); converted to a float64 ndarray.

    Returns
    -------
    numpy.ndarray
        Element-wise sigmoid of ``z``, same shape as the converted input.
    """
    z = np.asarray(z, dtype=np.float64)
    # np.where evaluates BOTH branches for every element, so each branch must
    # be safe for every z. exp(-|z|) always lies in [0, 1], which means
    # neither branch can overflow or produce inf/inf = nan.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    return out

In [16]:
def log_1pexp(z):
    """Numerically stable ``log(1 + exp(z))`` (softplus).

    Parameters
    ----------
    z : scalar or array_like
        Input value(s); converted to a float64 ndarray.

    Returns
    -------
    numpy.ndarray
        Element-wise ``log(1 + exp(z))``, same shape as the converted input.
    """
    z = np.asarray(z, dtype=np.float64)
    # np.where evaluates both branches for every element, so the exponential
    # is taken of -|z| (always <= 0) to rule out overflow in either branch.
    tail = np.log1p(np.exp(-np.abs(z)))
    # z >= 0: z + log1p(exp(-z));  z < 0: log1p(exp(z)).
    out = np.where(z >= 0, z + tail, tail)
    return out

In [18]:
def compute_cost(X, y, w, b, lambda_=0):
    """Logistic-regression cost with an optional L2 penalty on ``w``.

    The per-example loss is evaluated directly from the logits
    ``z = X·w + b`` as ``-y*z + log(1 + exp(z))``. The summed loss plus
    ``(lambda_ / 2) * sum(w**2)`` is then divided by the number of rows of
    ``X``. The bias ``b`` is not penalised.

    Parameters
    ----------
    X : array whose first axis indexes examples
    y : labels, broadcast against the logits
    w : weight vector
    b : bias
    lambda_ : L2 regularisation strength (default 0, i.e. no penalty)

    Returns
    -------
    The scalar cost.
    """
    n_examples = X.shape[0]
    logits = np.dot(X, w) + b
    per_example_loss = log_1pexp(logits) - y * logits
    l2_penalty = (lambda_ / 2) * np.sum(w**2)
    total = np.sum(per_example_loss) + l2_penalty
    return total / n_examples

In [17]:
def compute_gradients(X, y, w, b, lambda_=0):
    """Gradients of the L2-regularised logistic cost with respect to ``b`` and ``w``.

    Parameters
    ----------
    X : array whose first axis indexes examples
    y : labels, broadcast against the predictions
    w : weight vector
    b : bias
    lambda_ : L2 regularisation strength (default 0)

    Returns
    -------
    (dj_db, dj_dw)
        The bias gradient first, then the weight gradient. Only the weight
        gradient carries the ``lambda_ * w`` term.
    """
    n_examples = X.shape[0]
    # Prediction error: sigmoid of the logits minus the labels.
    residual = sigmoid(np.dot(X, w) + b) - y
    scale = 1 / n_examples
    grad_b = scale * np.sum(residual)
    grad_w = scale * (np.dot(X.T, residual) + lambda_ * w)
    return grad_b, grad_w

In [35]:
def gradient_descent(X, y, w_in, b_in, alpha, num_iters, lambda_=0):
    """Fit logistic-regression parameters with batch gradient descent.

    Each iteration computes the gradients at the current parameters, takes
    one step of size ``alpha``, and then evaluates the cost at the updated
    parameters. The loop stops early once the absolute change in cost
    between two consecutive iterations falls below ``1e-6``.

    Parameters
    ----------
    X, y : training inputs and labels, passed through to
        ``compute_gradients`` / ``compute_cost``
    w_in : initial weights (deep-copied, so the caller's array is not modified)
    b_in : initial bias
    alpha : learning rate
    num_iters : maximum number of iterations; if ``<= 0`` no step is taken
        and the initial parameters are returned with empty histories
    lambda_ : L2 regularisation strength (default 0)

    Returns
    -------
    (w, b, J_history, params_history, grads_history, iter_history)
        Final parameters followed by four parallel lists sampled every
        ``ceil(num_iters / 100000)`` iterations: cost after the update,
        ``[w, b]`` after the update, ``[dj_dw, dj_db]`` used for the update,
        and the iteration index.
    """
    J_history = []
    params_history = []
    grads_history = []
    iter_history = []
    w = copy.deepcopy(w_in)
    b = b_in

    # With no iterations there is no cost or gradient to report; return early
    # rather than falling through to a summary that references unset names.
    if num_iters <= 0:
        print(f"Final w : {w}, Final b : {b:.5f} (no iterations run)")
        return w, b, J_history, params_history, grads_history, iter_history

    # Cap the history at roughly 100000 entries and progress output at 10 lines.
    save_interval = math.ceil(num_iters / 100000)
    print_interval = math.ceil(num_iters / 10)
    cost_tol = 1e-6
    prev_cost = None

    for i in range(num_iters):
        dj_db, dj_dw = compute_gradients(X, y, w, b, lambda_)
        # Rebinding (rather than in-place update) keeps earlier history
        # entries pointing at their own arrays.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        cost = compute_cost(X, y, w, b, lambda_)
        if i % save_interval == 0:
            J_history.append(cost)
            params_history.append([w, b])
            grads_history.append([dj_dw, dj_db])
            iter_history.append(i)

        if i % print_interval == 0:
            print(
                f"Iter {i}: Cost = {cost:.5f}, w = {w}, b = {b:.5f}, dj_dw = {dj_dw}, dj_db = {dj_db:.5f}"
            )

        # Early stopping: the cost has effectively stopped changing.
        if prev_cost is not None and abs(cost - prev_cost) < cost_tol:
            print(
                f"Early stopping at iter {i} — Δloss = {abs(cost - prev_cost):.2e} < {cost_tol}"
            )
            break
        prev_cost = cost

    print(
        f"Final w : {w}, Final b : {b:.5f}, cost : {cost:.5f}, dj_dw = {dj_dw}, dj_db = {dj_db:.5f}"
    )
    return w, b, J_history, params_history, grads_history, iter_history

In [36]:
# Single-step run: start from zero weights and bias, take one gradient step.
w_in = np.zeros(X_train.shape[1], dtype=X_train.dtype)
b_in = 0.0
alpha = 0.01
lambda_ = 0.5
w_out, b_out, j_hist, params_hist, grads_hist, iter_hist = gradient_descent(
    X=X_train,
    y=y_train,
    w_in=w_in,
    b_in=b_in,
    alpha=alpha,
    num_iters=1,
    lambda_=lambda_,
)

Iter 0: Cost = 0.66586, w = [0.01       0.01416667], b = 0.00000, dj_dw = [-1.         -1.41666667], dj_db = 0.00000
Final w : [0.01       0.01416667], Final b : 0.00000, cost : 0.66586, dj_dw = [-1.         -1.41666667], dj_db = 0.00000


In [None]:
# Regularised run: up to 10000 iterations from zero weights with lambda_ = 0.5.
w_in = np.zeros(X_train.shape[1], dtype=X_train.dtype)
b_in = 0.0
alpha = 0.01
lambda_ = 0.5
num_iters = 10000
w_out, b_out, j_hist, params_hist, grads_hist, iter_hist = gradient_descent(
    X=X_train,
    y=y_train,
    w_in=w_in,
    b_in=b_in,
    alpha=alpha,
    num_iters=num_iters,
    lambda_=lambda_,
)

Iter 0: Cost = 0.66586, w = [0.01       0.01416667], b = 0.00000, dj_dw = [-1.         -1.41666667], dj_db = 0.00000
Iter 1000: Cost = 0.32521, w = [-0.46960778  0.79615553], b = -1.26930, dj_dw = [ 0.0078412  -0.02252826], dj_db = 0.09579
Iter 2000: Cost = 0.26119, w = [-0.46318001  0.90631992], b = -2.05470, dj_dw = [-0.0046583  -0.00477104], dj_db = 0.06516
Iter 3000: Cost = 0.22872, w = [-0.41163478  0.93587007], b = -2.61958, dj_dw = [-0.00509238 -0.00195387], dj_db = 0.04931
Iter 4000: Cost = 0.20891, w = [-0.3651383   0.95242745], b = -3.06102, dj_dw = [-0.00418736 -0.00148387], dj_db = 0.03969
Iter 5000: Cost = 0.19556, w = [-0.32740739  0.96656876], b = -3.42373, dj_dw = [-0.00339897 -0.00136029], dj_db = 0.03324
Iter 6000: Cost = 0.18594, w = [-0.29642254  0.97973932], b = -3.73181, dj_dw = [-0.00283056 -0.00127493], dj_db = 0.02861
Iter 7000: Cost = 0.17867, w = [-0.270276    0.99207197], b = -3.99970, dj_dw = [-0.00242032 -0.00119193], dj_db = 0.02512
Iter 8000: Cost = 0.17

In [39]:
# Unregularised run: same settings as the regularised run but with lambda_ = 0.
w_in = np.zeros(X_train.shape[1], dtype=X_train.dtype)
b_in = 0.0
alpha = 0.01
lambda_ = 0
num_iters = 10000
w_out, b_out, j_hist, params_hist, grads_hist, iter_hist = gradient_descent(
    X=X_train,
    y=y_train,
    w_in=w_in,
    b_in=b_in,
    alpha=alpha,
    num_iters=num_iters,
    lambda_=lambda_,
)

Iter 0: Cost = 0.66585, w = [0.01       0.01416667], b = 0.00000, dj_dw = [-1.         -1.41666667], dj_db = 0.00000
Iter 1000: Cost = 0.26038, w = [-0.71952331  1.06323864], b = -1.26324, dj_dw = [ 0.03947961 -0.05677925], dj_db = 0.09370
Iter 2000: Cost = 0.17446, w = [-1.00898059  1.49383776], b = -2.02172, dj_dw = [ 0.02200468 -0.03360902], dj_db = 0.06215
Iter 3000: Cost = 0.13409, w = [-1.19308914  1.7778563 ], b = -2.55661, dj_dw = [ 0.01563386 -0.02430232], dj_db = 0.04639
Iter 4000: Cost = 0.11031, w = [-1.33138765  1.9935251 ], b = -2.97042, dj_dw = [ 0.01232865 -0.01927035], dj_db = 0.03710
Iter 5000: Cost = 0.09445, w = [-1.44374437  2.16929026], b = -3.30897, dj_dw = [ 0.01028708 -0.01609983], dj_db = 0.03100
Iter 6000: Cost = 0.08303, w = [-1.53922345  2.31870505], b = -3.59633, dj_dw = [ 0.00888897 -0.01390644], dj_db = 0.02670
Iter 7000: Cost = 0.07435, w = [-1.62274347  2.44930601], b = -3.84659, dj_dw = [ 0.00786443 -0.01229068], dj_db = 0.02350
Iter 8000: Cost = 0.06