In [1]:
import numpy as np

In [2]:
## N training examples
## M features

X = np.array([
    [1.2, 3.1],
    [1.5, 2.8],
    [1.1, 3.3],
    [4.0, -1.2],
    [4.2, -1.0],
    [3.8, -0.8],
    [-2.5, 4.0],
    [-2.3, 4.2],
    [-2.6, 3.8],
    [-2.4, 4.1]
])

y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])
print(X)
print("")
print(y)

[[ 1.2  3.1]
 [ 1.5  2.8]
 [ 1.1  3.3]
 [ 4.  -1.2]
 [ 4.2 -1. ]
 [ 3.8 -0.8]
 [-2.5  4. ]
 [-2.3  4.2]
 [-2.6  3.8]
 [-2.4  4.1]]

[0 0 0 1 1 1 2 2 2 2]


In [3]:
n = X.shape[0]
m = X.shape[1]
classes = 3
print(n) ## No. of training examples
print(m) ## No. of features
print(classes) ## No. of classes of y

10
2
3


###### Initializing the parameters. W will be a mXk matrix where every class will have its own column vector of weights and b will be a 1Xk matrix where every class will have its own bias

In [4]:
W = np.zeros((m, classes))
b = np.zeros((1, classes))
print(W)
print(W.shape)
print("")
print(b)
print(b.shape)

[[0. 0. 0.]
 [0. 0. 0.]]
(2, 3)

[[0. 0. 0.]]
(1, 3)


###### The softmax function

In [5]:
def softmax(z):
  exp = np.exp(z)
  return exp/np.sum(exp, axis = 1, keepdims = True)

###### Defining hyperparameters

In [6]:
n_iterations = 1000
learning_rate = 0.1

###### Training the model using gradient descent.

In [10]:
for i in range(n_iterations):
  pred = np.dot(X, W) + b
  pred = softmax(pred)

  y_true = np.zeros_like(pred)
  y_true[np.arange(n), y] = 1 ## For every example gives which class it belngs to like [0, 0, 1] for class 2

  loss = -np.mean(np.log(pred[range(n), y])) ## Loss function
  if(i%100 == 0):
    print(loss)

  dW = (1/n) * np.dot(X.T, (pred - y_true))
  db = (1/n) * np.sum(pred - y_true, axis=0, keepdims=True)

  W -= learning_rate * dW ## Gradient Descent
  b -= learning_rate * db

print("")
print(W)
print("")
print(b)

0.001971544236339901
0.0018782500235573194
0.001793423602009459
0.0017159607955223317
0.0016449414972185227
0.001579592839650206
0.0015192608686111805
0.0014633885147398364
0.001411498287630889
0.0013631785519672849

[[ 0.97005596  1.9105679  -2.88062386]
 [ 1.1952037  -1.78999448  0.59479078]]

[[-0.11425388  0.18583719 -0.07158331]]


###### Predicting on train data to verify

In [None]:
y_pred = np.dot(X, W) + b
y_pred = softmax(y_pred)
np.argmax(y_pred, axis = 1)

array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])