# XOR ("exclusive OR") Problem Solution

f(0, 0) => 0 

f(1, 1) => 0 

f(0, 1) => 1

f(1, 0) => 1 

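In fact, a linear model can **not** learn it: the two classes are not linearly separable, so no single straight line splits {(0, 1), (1, 0)} from {(0, 0), (1, 1)}. That is why the network below uses a hidden layer with a non-linear activation.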

In [37]:
import numpy as np 

# Seed NumPy's global RNG so the random weight initialisation below is reproducible.
np.random.seed(42)

# Step size applied to every parameter update, and the number of training iterations.
learning_rate, epochs = 0.1, 10_000

def relu(z):
    """Rectified linear unit: element-wise maximum of ``z`` and zero."""
    floor = 0
    rectified = np.maximum(z, floor)
    return rectified

def relu_derivative(z):
    """Return 1 where ``z`` is strictly positive and 0 everywhere else.

    The training loop calls this with the ReLU *activation* rather than the
    pre-activation; the mask is the same because relu(x) > 0 exactly when
    x > 0.
    """
    is_active = z > 0
    return np.where(is_active, 1, 0)

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The value is evaluated as exp(-log(1 + exp(-z))) through ``np.logaddexp``
    so that large negative inputs no longer overflow ``np.exp`` (the direct
    formula triggers a floating-point overflow warning there).

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        Values between 0 and 1 with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed
    # without ever materialising exp(-z) on its own.
    return np.exp(-np.logaddexp(0, np.negative(z)))
    
def sigmoid_derivative(z):
    """Return ``z * (1 - z)``.

    This is the derivative of the logistic function expressed in terms of its
    *output*, so ``z`` must already be a sigmoid activation (the training loop
    passes ``predicted_outputs``), not a raw pre-activation.
    """
    complement = 1 - z
    return z * complement


# The four XOR input pairs, one per row.
X = np.array(
    [
        [0, 0],
        [1, 0],
        [0, 1],
        [1, 1],
    ]
)
# Matching XOR targets, stored as a column so they line up with the network output.
y = np.array([0, 1, 1, 0])[:, np.newaxis]



# Network layout: 2 inputs -> 4 hidden units -> 1 output.
input_layer_neurons, hidden_layer_neurons, output_layer_neurons = 2, 4, 1

# Biases start at zero; they do not consume any random numbers.
hidden_bias = np.zeros(shape=(1, hidden_layer_neurons))
output_bias = np.zeros(shape=(1, output_layer_neurons))

# Weights are drawn uniformly from [0, 1). The hidden weights are drawn before
# the output weights so a seeded run yields the same initial values.
hidden_weights = np.random.uniform(
    low=0.0,
    high=1.0,
    size=(input_layer_neurons, hidden_layer_neurons),
)
output_weights = np.random.uniform(
    low=0.0,
    high=1.0,
    size=(hidden_layer_neurons, output_layer_neurons),
)

for epoch in range(epochs):

    # ---- forward pass: inputs -> ReLU hidden layer -> sigmoid output ----
    hidden_layer_input = X.dot(hidden_weights) + hidden_bias
    hidden_layer_activation = relu(hidden_layer_input)
    output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
    predicted_outputs = sigmoid(output_layer_input)

    # ---- loss: mean squared error, used only for the progress log ----
    error = y - predicted_outputs
    mse = np.square(error).mean()

    # ---- backward pass ----
    # sigmoid_derivative receives the sigmoid *output*, which is the form it expects.
    d_predicted_outputs = sigmoid_derivative(predicted_outputs) * error

    # Send the output delta back through the output weights (not yet updated)
    # and mask it with the ReLU derivative.
    error_hidden_layer = np.dot(d_predicted_outputs, output_weights.T)
    d_hidden_layer = relu_derivative(hidden_layer_activation) * error_hidden_layer

    # ---- parameter updates ----
    # `error` is target minus prediction, so the scaled terms are added.
    output_weights = np.dot(hidden_layer_activation.T, d_predicted_outputs) * learning_rate + output_weights
    output_bias = d_predicted_outputs.sum(axis=0, keepdims=True) * learning_rate + output_bias

    hidden_weights = np.dot(X.T, d_hidden_layer) * learning_rate + hidden_weights
    hidden_bias = d_hidden_layer.sum(axis=0, keepdims=True) * learning_rate + hidden_bias

    # Report progress every 1000 iterations.
    if not epoch % 1000:
        print(f"{epoch=} and {mse=}")

# Predictions from the last forward pass (computed before the final update).
print(predicted_outputs)

epoch=0 and mse=0.3025839505181337
epoch=1000 and mse=0.01284320444022707
epoch=2000 and mse=0.003287966924352181
epoch=3000 and mse=0.0017330617326462823
epoch=4000 and mse=0.001147102822539652
epoch=5000 and mse=0.0008463523080943702
epoch=6000 and mse=0.000666258630898571
epoch=7000 and mse=0.0005469114799749259
epoch=8000 and mse=0.0004624541798224118
epoch=9000 and mse=0.00039970804778994396
[[0.02970087]
 [0.98607326]
 [0.98606514]
 [0.01165664]]


In [41]:
# Repeat the forward pass by hand for the single input (1, 0) with the trained parameters.
hidden_layer_input = np.array([[1, 0]]).dot(hidden_weights) + hidden_bias
hidden_layer_activation = relu(hidden_layer_input)
output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
sigmoid(output_layer_input)

array([[0.98607516]])

# Iris dataset

In [43]:
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder

In [47]:
# Load the Iris features and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# One-hot encode the labels into a dense array; the encoder is fed a single column.
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y.reshape(len(y), 1))

# Hold out 20% of the samples for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [81]:
# Re-seed NumPy's global RNG so this cell's random initialisation is reproducible.
np.random.seed(42)

# Step size applied to every parameter update, and the number of training iterations.
learning_rate, epochs = 0.1, 10_000

def relu(z):
    """Rectified linear unit: element-wise maximum of ``z`` and zero."""
    floor = 0
    rectified = np.maximum(z, floor)
    return rectified

def relu_derivative(z):
    """Return 1 where ``z`` is strictly positive and 0 everywhere else.

    The training loop calls this with the ReLU *activation* rather than the
    pre-activation; the mask is the same because relu(x) > 0 exactly when
    x > 0.
    """
    is_active = z > 0
    return np.where(is_active, 1, 0)

def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Each row has its maximum subtracted before exponentiation, which leaves
    the result unchanged mathematically but keeps ``np.exp`` from overflowing.
    Every returned row sums to 1.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    row_totals = exps.sum(axis=1, keepdims=True)
    return exps / row_totals

def softmax_derivative(z):
    """Unimplemented placeholder; always returns ``None``.

    The training loop in this cell never calls it: it back-propagates
    ``y_train - predicted_outputs`` directly instead.
    """
    return None

# Layer widths: one input per feature column, 10 hidden units, one output per
# one-hot label column.
input_layer_neurons = X_train.shape[1]
hidden_layer_neurons = 10
output_neurons = y_train.shape[1]

# Every parameter (biases included) starts from uniform samples in [0, 1).
# The draw order -- hidden weights, hidden bias, output weights, output bias --
# is kept fixed so a seeded run yields the same initial values.
hidden_weights = np.random.uniform(
    low=0.0,
    high=1.0,
    size=(input_layer_neurons, hidden_layer_neurons),
)
hidden_bias = np.random.uniform(low=0.0, high=1.0, size=(1, hidden_layer_neurons))

output_weights = np.random.uniform(
    low=0.0,
    high=1.0,
    size=(hidden_layer_neurons, output_neurons),
)
output_bias = np.random.uniform(low=0.0, high=1.0, size=(1, output_neurons))


for epoch in range(epochs):
    # ---- forward pass: features -> ReLU hidden layer -> softmax probabilities ----
    hidden_layer_input = X_train.dot(hidden_weights) + hidden_bias
    hidden_layer_activation = relu(hidden_layer_input)
    output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
    predicted_outputs = softmax(output_layer_input)

    # Mean cross-entropy over the training set; 1e-8 keeps log() away from zero.
    loss = -np.mean((y_train * np.log(predicted_outputs + 1e-8)).sum(axis=1))

    # Target minus prediction. Every d_* term below is built from it and is
    # *added* to its parameter in the update step.
    error = y_train - predicted_outputs

    # Send the error back through the output weights (not yet updated) and mask
    # it with the ReLU derivative.
    error_hidden_layer = relu_derivative(hidden_layer_activation) * np.dot(error, output_weights.T)

    # Update directions averaged over the number of training samples.
    d_output_weights = np.dot(hidden_layer_activation.T, error) / len(X_train)
    d_output_bias = error.sum(axis=0, keepdims=True) / len(X_train)
    d_hidden_weights = np.dot(X_train.T, error_hidden_layer) / len(X_train)
    d_hidden_bias = error_hidden_layer.sum(axis=0, keepdims=True) / len(X_train)

    # In-place parameter updates.
    hidden_bias += d_hidden_bias * learning_rate
    hidden_weights += d_hidden_weights * learning_rate
    output_bias += d_output_bias * learning_rate
    output_weights += d_output_weights * learning_rate

    # Report progress every 1000 iterations.
    if not epoch % 1000:
        print(f"{epoch=} and {loss=}")
        


epoch=0 and loss=5.635810831339497
epoch=1000 and loss=0.07471408351855109
epoch=2000 and loss=0.06213335675673911
epoch=3000 and loss=0.06043267321214014
epoch=4000 and loss=0.05939030463394691
epoch=5000 and loss=0.0585932656597506
epoch=6000 and loss=0.05783386799148585
epoch=7000 and loss=0.05721356849612756
epoch=8000 and loss=0.05660345108130884
epoch=9000 and loss=0.055970161263423385


In [82]:
# Forward pass over the held-out samples using the trained parameters.
hidden_layer_input = X_test.dot(hidden_weights) + hidden_bias
hidden_layer_activation = relu(hidden_layer_input)
output_layer_input = hidden_layer_activation.dot(output_weights) + output_bias
predicted_test_outputs = softmax(output_layer_input)

In [83]:
from sklearn.metrics import accuracy_score
# Reduce predicted probabilities and one-hot targets to class indices.
y_hat = predicted_test_outputs.argmax(axis=1)
y_test_argmax = y_test.argmax(axis=1)

# Fraction of held-out samples whose predicted class matches the target.
accuracy_score(y_test_argmax, y_hat)

0.9666666666666667