In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
X, y = load_iris()['data'], load_iris()['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234)

X_train, X_test = X_train.T, X_test.T
y_train = y_train.reshape(1,len(y_train))
y_test = y_test.reshape(1, len(y_test))

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4, 120) (1, 120)
(4, 30) (1, 30)


In [3]:
def OneHotEncode(y):
    shape=(y.shape[1], y.max()+1)
    OHE = np.zeros(shape)
    rows = np.arange(y.shape[1])
    OHE[rows, y]=1
    
    return OHE

In [4]:
y_train = OneHotEncode(y_train)
y_test = OneHotEncode(y_test)

In [5]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4, 120) (120, 3)
(4, 30) (30, 3)


### Main model

In [6]:
def softmax(Z):
    if Z.shape[0] < Z.shape[1]:
        Z = Z.T
    
    exps = np.exp(Z-Z.max())
    sums = np.sum(exps, axis=1, keepdims=True)
    softmax = exps/sums
    return softmax

In [7]:
W = np.random.randn(y_train.shape[1], X_train.shape[0])
b = np.zeros(shape=(y_train.shape[1], 1))

In [8]:
def feed_forward(X):
    Z = np.dot(W, X) + b
    A = softmax(Z)
    return (Z, A)

In [9]:
Z_train, A_train = feed_forward(X_train)

In [10]:
print(Z_train.shape)
print(A_train.shape)

(3, 120)
(120, 3)


In [11]:
def log_loss(pred, gdt):
    m = len(pred)
    loss = -1/m * np.sum(gdt * np.log(pred), keepdims=True)
    return loss

In [12]:
initial_loss = log_loss(A_train, y_train)
print(initial_loss)

[[7.11619399]]


### Gradient Descent

In [13]:
def compute_grads(pred, W, b):
    delta = pred - y_train
    
    W_grad = np.dot(delta.T, X_train.T)
    b_grad = delta.T
    return (W_grad, b_grad)

In [14]:
epochs = 200
learning_rate = 0.01

In [15]:
for epoch in range (epochs):
    Z_train, A_train = feed_forward(X_train)
    loss = log_loss(A_train, y_train)
    
    delta = A_train - y_train
    W_grad = np.dot(delta.T, X_train.T)
    
    W -= learning_rate * W_grad
    #b -= learning_rate * delta.T
    
    if epoch% 50 == 0:
        print(epoch, "  ", loss)

0    [[7.11619399]]
50    [[2.34642095]]
100    [[9.70515845]]
150    [[0.74934144]]
