In [1]:
import numpy as np

# softmax

In [2]:
batch_size = 3
n_features = 20
n_classes = 4

In [3]:
w = np.random.rand(n_features, n_classes)
x = np.random.rand(batch_size, n_features)
b = np.random.rand(n_classes)

In [4]:
x.shape, w.shape, b.shape

((3, 20), (20, 4), (4,))

In [5]:
pred = np.dot(x, w) + b

In [6]:
pred

array([[6.30459657, 6.94441991, 6.95591216, 5.07969373],
       [5.86920001, 6.74242226, 6.20244926, 4.89216582],
       [5.82581377, 5.23626278, 5.12938576, 4.16661602]])

In [7]:
pred.shape

(3, 4)

In [8]:
exp_scores = np.exp(pred)

In [9]:
exp_scores

array([[ 547.08083691, 1037.34507194, 1049.33526095,  160.72482306],
       [ 353.96569641,  847.61138724,  493.95739276,  133.24183898],
       [ 338.93683682,  187.96631741,  168.91333264,   64.49682628]])

In [10]:
exp_scores.sum(1, keepdims=True).shape

(3, 1)

In [11]:
exp_scores.sum(1, keepdims=True)

array([[2794.48599285],
       [1828.77631539],
       [ 760.31331315]])

In [12]:
exp_scores.sum(1, keepdims=False)

array([2794.48599285, 1828.77631539,  760.31331315])

In [13]:
softmax = exp_scores / exp_scores.sum(1, keepdims=True)

In [14]:
softmax.sum(1)

array([1., 1., 1.])

In [15]:
softmax[0]

array([0.19577154, 0.37121141, 0.37550207, 0.05751499])

In [16]:
softmax[0].sum()

0.9999999999999999

In [17]:
softmax.sum(1)

array([1., 1., 1.])

In [18]:
y = np.random.randint(n_classes, size=(batch_size))

In [19]:
y

array([1, 3, 0])

In [20]:
n_classes

4

In [21]:
range(pred.shape[0]), y

(range(0, 3), array([1, 3, 0]))

In [22]:
softmax

array([[0.19577154, 0.37121141, 0.37550207, 0.05751499],
       [0.1935533 , 0.46348555, 0.27010268, 0.07285847],
       [0.44578575, 0.24722218, 0.2221628 , 0.08482927]])

In [23]:
selected_pred = softmax[range(pred.shape[0]), y]

In [24]:
selected_pred

array([0.37121141, 0.07285847, 0.44578575])

In [25]:
selected_pred.shape

(3,)

In [26]:
loss = - np.log(selected_pred) / batch_size

In [27]:
loss

array([0.33032785, 0.87307884, 0.26930561])

In [28]:
loss.mean()

0.4909041016590225

# sigmoid

In [29]:
data_length = 3
n_features = 20
n_classes = 1

In [30]:
w = np.random.rand(n_features, n_classes) * 0.01
x = np.random.rand(data_length, n_features)
b = np.random.rand(n_classes)

In [31]:
pred = np.dot(x, w) + b

In [32]:
pred

array([[0.31455734],
       [0.33232317],
       [0.31889933]])

In [33]:
pred.shape

(3, 1)

In [34]:
sigmoid = 1 / (1 + np.exp(-pred))

In [35]:
sigmoid

array([[0.57799726],
       [0.58232453],
       [0.57905599]])

In [36]:
sigmoid.shape

(3, 1)

In [37]:
sigmoid = sigmoid.squeeze()

In [38]:
sigmoid.shape

(3,)

In [39]:
y = np.random.randint(n_classes + 1, size=(data_length))

In [40]:
y

array([1, 0, 1])

In [41]:
loss = - (y * np.log(sigmoid) + (1 - y) * np.log(1 - sigmoid)) / batch_size

In [42]:
loss

array([0.18272871, 0.29101685, 0.1821187 ])

In [43]:
loss.mean()

0.21862142118813097

# grad

In [44]:
w = np.random.rand(200, 100)
x = np.random.rand(32, 200)
b = np.random.rand(100)

grad_prev = np.random.rand(32, 100)

In [45]:
(np.dot(x, w) + b).shape

(32, 100)

In [46]:
grad_prev.shape, x.shape, w.shape, b.shape

((32, 100), (32, 200), (200, 100), (100,))

In [47]:
x.T.shape

(200, 32)

In [48]:
grad_w = np.dot(x.T, grad_prev)

In [49]:
grad_w.shape, grad_w.shape == w.shape

((200, 100), True)

In [50]:
grad_x = np.dot(grad_prev, w.T)

In [51]:
grad_x.shape, grad_x.shape == x.shape

((32, 200), True)

In [52]:
b.shape

(100,)

In [53]:
grad_prev.shape

(32, 100)

In [54]:
grad_b = np.dot(np.ones((32, 1)).T, grad_prev)

In [55]:
grad_b.squeeze()

array([17.87615792, 15.03178165, 14.19354793, 16.67462701, 13.77267492,
       16.2800444 , 16.68438411, 14.78816793, 13.99976447, 14.17970054,
       14.09071927, 14.63489448, 18.00838252, 16.29073921, 16.47392878,
       16.14459293, 15.76179655, 15.6027461 , 16.92918758, 18.86956103,
       18.84851172, 16.08203517, 14.97906063, 14.50221373, 16.75076249,
       15.59186577, 17.76227813, 15.68820834, 16.31254786, 15.89369226,
       16.35555889, 13.88542584, 14.58099167, 16.28236615, 14.02996715,
       16.23697404, 15.00671531, 18.19558133, 17.32930262, 16.07215357,
       17.27017006, 14.09443503, 16.66276728, 18.26830776, 14.59825887,
       16.96092658, 18.28505005, 14.46295744, 19.10585999, 14.537708  ,
       17.02946077, 14.62120634, 16.59338853, 15.76398297, 17.84204239,
       18.31496198, 15.68177587, 16.3015043 , 15.218947  , 16.06821618,
       13.97408244, 17.32127385, 13.60634602, 13.70981396, 11.96711744,
       19.77847664, 16.39964064, 17.44758557, 16.87668678, 16.29

In [56]:
grad_b.shape, grad_b.shape == b.shape

((1, 100), False)

In [57]:
b.shape

(100,)

In [58]:
grad_b = grad_b.squeeze()

In [59]:
grad_b.shape, grad_b.shape == b.shape

((100,), True)

In [60]:
learning_rate = 0.1

In [61]:
w = w - learning_rate * grad_w
b = b - learning_rate * grad_b

In [62]:
grad_prev.shape

(32, 100)

In [63]:
b.shape

(100,)

In [64]:
grad_b = grad_prev.sum(0)

In [65]:
grad_b.shape, grad_b.shape == b.shape

((100,), True)

In [66]:
import torch

In [67]:
class MyReLU(torch.autograd.Function):
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-67-2354ac9f5d7f>, line 3)