# XOR Problem 

f(0, 0) => 0 

f(1, 0) => 1

f(0, 1) => 1

f(1, 1) => 0 

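In fact, linear models *cannot* solve this problem: the two classes are not linearly separable, so no single straight line can split the inputs that map to 1 from the inputs that map to 0. A hidden layer with a non-linear activation is needed.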

In [4]:
import numpy as np 

# Seed NumPy's global RNG so the random weight initialisation later in this cell is repeatable.
np.random.seed(42)
# Step size applied to every weight/bias update in the training loop below.
learning_rate = 0.1
# Number of iterations of the training loop (each one uses all four XOR rows).
epochs = 1_000

def relu(z):
    """Rectified linear unit: element-wise ``max(z, 0)``."""
    rectified = np.maximum(z, 0)
    return rectified

def relu_derivative(z):
    """Element-wise slope of ReLU: 1 where ``z`` is strictly positive, else 0."""
    is_positive = z > 0
    return np.where(is_positive, 1, 0)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def sigmoid_derivative(z):
    """Derivative of the sigmoid, expressed through the sigmoid's output.

    Despite the parameter name, ``z`` is treated as a sigmoid *output* ``s``:
    the expression ``s * (1 - s)`` is the sigmoid's slope only when
    ``s = sigmoid(x)``. The XOR training loop passes ``predicted_outputs``,
    which is already a sigmoid output.
    """
    activation = z
    return activation * (1 - activation)


# XOR truth table: one row per input pair in `X`, matching target in `y`.
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# Layer widths: inputs -> hidden units -> outputs.
# NOTE (original author): 3 is also valid, 2 seems invalid
input_layer_neurons, hidden_layer_neurons, output_layer_neurons = 2, 3, 1

# Weights are uniform random draws (hidden layer first, then output layer, so
# the draw order from the seeded RNG is fixed); biases start at zero.
hidden_weights = np.random.uniform(
    size=(input_layer_neurons, hidden_layer_neurons)
)
output_weights = np.random.uniform(
    size=(hidden_layer_neurons, output_layer_neurons)
)
hidden_bias = np.zeros((1, hidden_layer_neurons))
output_bias = np.zeros((1, output_layer_neurons))


for epoch in range(epochs):
    # Forward pass: inputs -> ReLU hidden layer -> sigmoid output.
    hidden_layer_input = X.dot(hidden_weights) + hidden_bias
    hidden_layer_activation = relu(hidden_layer_input)
    output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
    predicted_outputs = sigmoid(output_layer_input)

    # Mean squared error of this pass; only used for the progress print.
    error = y - predicted_outputs
    mse = np.square(error).mean()

    # Backward pass. Both deltas are computed before any parameter is changed,
    # so the hidden-layer delta uses the pre-update output weights.
    d_predicted_outputs = error * sigmoid_derivative(predicted_outputs)
    error_hidden_layer = d_predicted_outputs.dot(output_weights.T)
    d_hidden_layer = error_hidden_layer * relu_derivative(hidden_layer_activation)

    # `error` is target minus prediction, so the step is added, not subtracted.
    output_weights += learning_rate * hidden_layer_activation.T.dot(d_predicted_outputs)
    output_bias += learning_rate * d_predicted_outputs.sum(axis=0, keepdims=True)
    hidden_weights += learning_rate * X.T.dot(d_hidden_layer)
    hidden_bias += learning_rate * d_hidden_layer.sum(axis=0, keepdims=True)

    # Progress report every 100 epochs.
    if not epoch % 100:
        print(f"{epoch=} and {mse=}")

# Predictions from the last forward pass (computed before the final update).
print(predicted_outputs)

epoch=0 and mse=0.2914809379598231
epoch=100 and mse=0.25132493190245053
epoch=200 and mse=0.2350917411445105
epoch=300 and mse=0.1958778316906303
epoch=400 and mse=0.14556776894980394
epoch=500 and mse=0.09662363196609818
epoch=600 and mse=0.05999120706179112
epoch=700 and mse=0.03842152278682629
epoch=800 and mse=0.02590619393376082
epoch=900 and mse=0.018780635137008426
[[0.1782055 ]
 [0.90234507]
 [0.90253518]
 [0.08097861]]


In [2]:
# Display the XOR target array (compare with the predictions printed by the training cell).
y

array([[0],
       [1],
       [1],
       [0]])

# Iris dataset 

A multi-class classification problem:

- Number of labels (classes): 3
- The data is tabular (not images)

In [8]:
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [10]:
# Load the Iris data: features go to X, class labels to y.
iris = load_iris()
X, y = iris.data, iris.target

# Replace the label vector with its dense one-hot encoding
# (the labels are first reshaped into a single column for the encoder).
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y.reshape(-1, 1))

In [12]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Seed NumPy's global RNG; the random parameter initialisation happens in a later cell.
np.random.seed(42)
# Step size for the parameter updates in the Iris training loop.
learning_rate = 0.01
# Number of iterations of the Iris training loop.
epochs = 10_000

# NOTE(review): duplicates the `relu` already defined in the XOR cell;
# consider keeping a single definition.
def relu(z):
    """Rectified linear unit: element-wise ``max(z, 0)``."""
    rectified = np.maximum(z, 0)
    return rectified

# NOTE(review): duplicates the `relu_derivative` already defined in the XOR cell;
# consider keeping a single definition.
def relu_derivative(z):
    """Element-wise slope of ReLU: 1 where ``z`` is strictly positive, else 0."""
    is_positive = z > 0
    return np.where(is_positive, 1, 0)

def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating; this does not
    change the result mathematically but keeps ``np.exp`` from overflowing
    on large scores.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals



In [37]:
# Re-seed in the same cell as the random draws so that re-running this cell
# always produces the same starting parameters. With only the seed call in the
# earlier hyper-parameter cell, every re-run of this cell would continue the
# RNG stream and start training from different weights.
np.random.seed(42)

# Layer widths are taken from the data: one input per feature column and one
# output per one-hot label column.
input_layer_neurons = X_train.shape[1]
hidden_layer_neurons = 5 # XXX: you can change it
output_layer_neurons = y_train.shape[1]

# Uniform random initial weights; the draw order (hidden weights, hidden bias,
# output weights) is kept fixed so the seeded values stay the same.
hidden_weights = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
hidden_bias = np.random.uniform(size=(1, hidden_layer_neurons)) # NOTE: np.zeros(...) is a valid alternative
output_weights = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
output_bias = np.zeros((1, output_layer_neurons)) # NOTE: we can change it np.random.uniform()

In [38]:
# NOTE: the parameters are updated in place, so re-running this cell without
# re-running the initialisation cell continues training from the current values.
n_samples = X_train.shape[0]

for epoch in range(epochs):
    # Forward pass: ReLU hidden layer followed by a softmax output layer.
    hidden_layer_input = X_train.dot(hidden_weights) + hidden_bias
    hidden_layer_activation = relu(hidden_layer_input)
    output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
    predicted_outputs = softmax(output_layer_input)

    # Log loss averaged over the samples; the 1e-8 keeps log() away from exact zeros.
    loss = - (y_train * np.log(predicted_outputs + 1e-8)).sum(axis=1).mean()
    error = y_train - predicted_outputs

    # Backward pass. `error` is target minus prediction, which is why the
    # parameter steps below are added rather than subtracted.
    error_hidden_layer = error.dot(output_weights.T) * relu_derivative(hidden_layer_activation)

    # All gradients are averaged over the training samples.
    d_output_weights = hidden_layer_activation.T.dot(error) / n_samples
    d_output_bias = error.sum(axis=0, keepdims=True) / n_samples
    d_hidden_weights = X_train.T.dot(error_hidden_layer) / n_samples
    d_hidden_bias = error_hidden_layer.sum(axis=0, keepdims=True) / n_samples

    # Apply the updates only after every gradient has been computed.
    output_weights += learning_rate * d_output_weights
    output_bias += learning_rate * d_output_bias
    hidden_weights += learning_rate * d_hidden_weights
    hidden_bias += learning_rate * d_hidden_bias

    # Progress report every 1000 epochs.
    if not epoch % 1000:
        print(f"{epoch=} and {loss=}")

# Training-set probabilities from the last forward pass (before the final update).
print(predicted_outputs)
    

epoch=0 and loss=6.426498398422818
epoch=1000 and loss=0.17483620451744405
epoch=2000 and loss=0.10257819894920667
epoch=3000 and loss=0.08403806062035032
epoch=4000 and loss=0.07621786753883336
epoch=5000 and loss=0.07200393936700504
epoch=6000 and loss=0.06938077754030263
epoch=7000 and loss=0.06758210297826528
epoch=8000 and loss=0.0662553785847467
epoch=9000 and loss=0.06522204988842592
[[9.99635859e-01 3.64141430e-04 2.10633591e-19]
 [9.99919654e-01 8.03457343e-05 7.94089003e-23]
 [4.59913364e-04 9.99452150e-01 8.79363429e-05]
 [9.98814810e-01 1.18518998e-03 8.72076816e-18]
 [9.99079598e-01 9.20401546e-04 1.55798958e-17]
 [1.13944609e-08 1.27504619e-02 9.87249527e-01]
 [1.95041512e-04 9.99080219e-01 7.24739726e-04]
 [9.99595736e-01 4.04264309e-04 1.45404870e-19]
 [9.99704770e-01 2.95230459e-04 7.10890376e-20]
 [9.99858812e-01 1.41187963e-04 1.42272041e-21]
 [1.63282181e-09 2.82167685e-03 9.97178322e-01]
 [1.41317988e-04 9.97498743e-01 2.35993897e-03]
 [9.52550817e-05 9.98297581e-0

In [39]:
# Forward pass on the held-out test set with the current parameters
# (same layers as in the training loop: ReLU hidden layer, softmax output).
hidden_layer_input = X_test.dot(hidden_weights) + hidden_bias
hidden_layer_activation = relu(hidden_layer_input)

output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
predicted_test_outputs = softmax(output_layer_input)
    

In [40]:
# Collapse each probability row / one-hot row to the index of its largest entry.
y_hat = predicted_test_outputs.argmax(axis=1)
y_test_argmax = y_test.argmax(axis=1)

In [41]:
# Predicted class index for each test sample.
y_hat

array([1, 0, 2, 1, 1, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [42]:
# Class index recovered from the one-hot test labels, for comparison with y_hat.
y_test_argmax

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [43]:
# Compare the predicted class indices against the true ones on the test set.
accuracy_score(y_test_argmax, y_hat)

0.9666666666666667