In [13]:
# Linear Binary Classification
# Given a set of N points in D-dimensional space and their boolean labels, find a hyperplane that separates the true class from the false class. Return a classifier that takes in a point vector and returns the probability of the true class.

import numpy as np

def binary_classifier_from(X, Y, lr=0.001, num_steps=200, verbose=True):
    '''
        Fit a logistic-regression classifier with full-batch gradient descent.

        The features are standardized with the training mean / std, and a
        linear model sigmoid(z @ W + b) is trained on the binary
        cross-entropy loss summed over the samples.

        X, a float np.array of shape [N, D]
        Y, a boolean array of labels, shape [N]
        lr, step size applied to the summed (not averaged) gradient
        num_steps, number of gradient-descent iterations
        verbose, when True print the mean loss of every iteration

        Returns a function that maps points of shape [M, D] (or a single
        point of shape [D]) to the probability of the True class.

        Raises ValueError when X is not a non-empty 2-D array or when Y does
        not hold exactly one label per row of X.
    '''
    X = np.asarray(X, dtype=float)
    labels = np.asarray(Y).astype(float)  # True -> 1.0, False -> 0.0

    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError(f"X must be a non-empty array of shape [N, D], got shape {X.shape}")
    if labels.shape != (X.shape[0],):
        raise ValueError(f"Y must have shape [{X.shape[0]}], got shape {labels.shape}")

    epsilon = 1e-5

    D = X.shape[1]

    # A private generator yields the same initial weights as
    # np.random.seed(42) followed by np.random.randn(D), but leaves the
    # caller's global random state untouched.
    W = np.random.RandomState(42).randn(D)  # D,
    b = 0.0  # 1

    def sigmoid(x):
        # sigmoid(x) = exp(-log(1 + exp(-x))); logaddexp evaluates the inner
        # log without overflowing for large |x|.
        return np.exp(-np.logaddexp(0.0, -x))

    m = X.mean(0)
    scale = X.std(0) + epsilon  # epsilon guards against zero-variance features

    def forward(vectors):
        standardized_v = (np.asarray(vectors, dtype=float) - m) / scale
        logit = standardized_v @ W + b  # N
        return sigmoid(logit)  # N

    # The standardized training set does not change between iterations.
    standardized_x = (X - m) / scale

    for _ in range(num_steps):
        # Forward
        prob = sigmoid(standardized_x @ W + b)  # N

        if verbose:
            loss = - labels * np.log(prob + epsilon) - (1 - labels) * np.log(1 - prob + epsilon)
            print(f"loss.mean={loss.mean()}")

        # Backward: for sigmoid + cross-entropy, dL/dlogit reduces to prob - label.
        d_logit = prob - labels  # N
        d_W = standardized_x.T @ d_logit  # D
        d_b = d_logit.sum()

        W -= lr * d_W
        b -= lr * d_b

    return forward


# train_data = np.array([
#     [0, 1], [1, 2], [3, 4], [1, 1], [2, 3],
# ], dtype=float)
# train_labels = np.array([True, True, True, False, True])

# train_data = np.array([
#     [0, 1], [1, 2],
# ], dtype=float)
# train_labels = np.array([True, False])

# Two Gaussian blobs in 10-D whose centres differ only in the first two
# coordinates. Drawing them from a dedicated, seeded generator makes the data
# (and therefore the printed losses and probabilities) reproducible after a
# kernel restart.
data_rng = np.random.RandomState(0)
train_data = np.concatenate(
    [
        data_rng.randn(100, 10) + np.array([50, 20] + [0] * 8, dtype=float),
        data_rng.randn(100, 10) + np.array([20, 0] + [0] * 8, dtype=float),
    ], axis=0)
train_labels = np.array([True] * 100 + [False] * 100)

model = binary_classifier_from(train_data, train_labels)
print(model(train_data))

loss.mean=1.0913210588254132
loss.mean=0.9850030766634523
loss.mean=0.8883022642920164
loss.mean=0.8007055281144693
loss.mean=0.7216600587492563
loss.mean=0.650583007706536
loss.mean=0.5868734915700964
loss.mean=0.5299253517707319
loss.mean=0.47913926430831877
loss.mean=0.4339332494308596
loss.mean=0.3937511249071102
loss.mean=0.35806881772524507
loss.mean=0.3263986606318467
loss.mean=0.2982918936929117
loss.mean=0.27333962548098123
loss.mean=0.2511725214961169
loss.mean=0.23145949142293545
loss.mean=0.2139056404773614
loss.mean=0.19824973042811564
loss.mean=0.1842613637665792
loss.mean=0.17173806450426127
loss.mean=0.16050238702408812
loss.mean=0.15039914523624553
loss.mean=0.14129282103771834
loss.mean=0.13306518490335079
loss.mean=0.12561314212803096
loss.mean=0.11884680482198451
loss.mean=0.11268778100065878
loss.mean=0.10706766681870515
loss.mean=0.10192672517350193
loss.mean=0.0972127327705962
loss.mean=0.0928799777284441
loss.mean=0.0888883904939257
loss.mean=0.08520279195733799

In [10]:
# multi-class logistic regression

# loss = F.cross_entropy(logits, Y)   (PyTorch's torch.nn.functional.cross_entropy; re-derived here in NumPy)
# what is d_logits?

# probs = softmax(logits)
# loss = -np.log(probs[y]).mean()

# loss = - log(e ** logits[y] / SUM_j(e ** logits[j]) )

# dL / d(logits[i]) = - probs[y] ** -1 * d(e ** logits[y] / SUM_j(e ** logits[j])) / d(logits[i])
# if i != y
#  dL / d(logits[i]) = - probs[y] ** -1 * (-1) * e ** logits[y] / SUM_j(e ** logits[j]) ** 2 * e ** logits[i]
#  dL / d(logits[i]) = - probs[y] ** -1 * (-1) * probs[y] * probs[i]
#  dL / d(logits[i]) = probs[i]
# if i == y
#  dL / d(logits[i]) = - probs[y] ** -1 *
#      (e ** logits[i] * SUM_j(e ** logits[j]) - d(SUM_j(e ** logits[j])) / d(logits[i]) * e ** logits[i])
#      / SUM_j(e ** logits[j]) ** 2
#  dL / d(logits[i]) = - probs[y] ** -1 *
#      (e ** logits[i] * SUM_j(e ** logits[j]) - e ** logits[i] * e ** logits[i])
#      / SUM_j(e ** logits[j]) ** 2
#  dL / d(logits[i]) = - probs[y] ** -1 * e ** logits[i] * (SUM_j(e ** logits[j]) - e ** logits[i])
#      / SUM_j(e ** logits[j]) ** 2
#  dL / d(logits[i]) = - probs[y] ** -1 * probs[i] * (1 - e ** logits[i] / SUM_j(e ** logits[j]))
#  dL / d(logits[i]) = - (1 - probs[i])
#  dL / d(logits[i]) = probs[i] - 1

def softmax(x, dim):
  '''
    Softmax of `x` along axis `dim`.

    x: array of scores
    dim: the axis that is normalized; entries along it sum to 1

    Returns an array with the same shape as `x`.
  '''
  # Shifting by the per-slice maximum leaves the result unchanged but keeps
  # the exponentials from overflowing.
  shifted = x - np.max(x, dim, keepdims=True)
  exps = np.exp(shifted)
  return exps * exps.sum(dim, keepdims=True) ** -1

def grad_of_loss_wrt_logits(logits, Y):
  '''
    Gradient of the batch-mean cross-entropy loss with respect to `logits`.

    logits: float array indexed as [row, class]
    Y: integer class index for every row of `logits`

    Returns (softmax(logits) - one_hot(Y)) / number_of_rows, with the same
    shape as `logits`.
  '''
  batch_size = logits.shape[0]
  probs = softmax(logits, 1)
  one_hot = np.zeros_like(probs)
  one_hot[range(batch_size), Y] = 1.0
  return (probs - one_hot) / batch_size

# softmax(np.zeros([2, 2], dtype=float), 1)

# Two example rows of 4-class logits together with their target classes.
test_logits = np.array(
  [[0.4, 0.0, -4.0, -1.0],
   [0.4, 0.0, -4.0, -10.0]],
  dtype=float,
)
test_Y = np.array([1, 3], dtype=int)

# Analytic gradient of the mean cross-entropy with respect to the logits.
logits_grad = grad_of_loss_wrt_logits(test_logits, test_Y)
print(f"test_logits={test_logits}")
print(f"softmax(test_logits)={softmax(test_logits, 1)}")
print(f"logits_grad={logits_grad}")

# verify by numerical gradient: nudge the single entry [1, 1] by a small step
epsilon = 0.0001
nudge = np.zeros_like(test_logits)
nudge[1, 1] = epsilon
test_logits_w_d = test_logits + nudge
print(f"test_logits_w_d={test_logits_w_d}")

def cross_entropy_loss(logits, labels):
  '''
    Mean cross-entropy between softmax(logits) and integer class labels.

    logits: dtype float, shape of [B, C], the unnormalized score of each class
    labels: dtype int, shape of [B] or [B, 1], the class label indices

    Returns the scalar mean over the batch of -log(softmax(logits)[i, labels[i]]).
  '''
  logits = np.asarray(logits, dtype=float)
  # Flatten so that a [B, 1] column of labels selects one entry per row
  # instead of broadcasting against the row index into a [B, B] selection.
  labels = np.asarray(labels).reshape(-1)
  # log-softmax computed as logits - logsumexp(logits). Taking the log of an
  # already-normalized probability returns -inf once that probability
  # underflows to 0; the log-sum-exp form stays finite.
  log_prob = logits - np.logaddexp.reduce(logits, axis=1, keepdims=True)
  return -log_prob[np.arange(logits.shape[0]), labels].mean()

# Check every entry of the analytic gradient, not just one, against a
# central-difference estimate, and fail loudly on a mismatch.
numeric_grad = np.zeros_like(test_logits)
for idx in np.ndindex(*test_logits.shape):
  bumped_up = test_logits.copy()
  bumped_up[idx] += epsilon
  bumped_down = test_logits.copy()
  bumped_down[idx] -= epsilon
  numeric_grad[idx] = (
    cross_entropy_loss(bumped_up, test_Y) - cross_entropy_loss(bumped_down, test_Y)
  ) / (2 * epsilon)
np.testing.assert_allclose(numeric_grad, logits_grad, rtol=0, atol=1e-6)

# Forward-difference estimate for the single perturbed entry [1, 1]; the value
# displayed below should be close to logits_grad[1, 1].
(cross_entropy_loss(test_logits_w_d, test_Y) - cross_entropy_loss(test_logits, test_Y)) / epsilon

test_logits=[[  0.4   0.   -4.   -1. ]
 [  0.4   0.   -4.  -10. ]]
softmax(test_logits)=[[5.18351093e-01 3.47461129e-01 6.36397256e-03 1.27823806e-01]
 [5.94308491e-01 3.98376895e-01 7.29652735e-03 1.80862831e-05]]
logits_grad=[[ 0.25917555 -0.32626944  0.00318199  0.0639119 ]
 [ 0.29715425  0.19918845  0.00364826 -0.49999096]]
test_logits_w_d=[[ 4.e-01  0.e+00 -4.e+00 -1.e+00]
 [ 4.e-01  1.e-04 -4.e+00 -1.e+01]]


0.1991944394497125