In [4]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load and preprocess the data
iris = load_iris()
X = iris.data
y = iris.target.reshape(-1, 1)

# One-hot encode the target
encoder = OneHotEncoder(sparse_output=False)  # For sklearn >= 1.2
y_encoded = encoder.fit_transform(y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42
)

# Activation functions and derivatives
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # numerical stability
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Loss and accuracy functions
def cross_entropy_loss(y_true, y_pred):
    return -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))

def compute_accuracy(y_true, y_pred):
    return np.mean(np.argmax(y_true, axis=1) == np.argmax(y_pred, axis=1))

# Network architecture
input_size = X_train.shape[1]   # 4 features
hidden_size = 100               # 100 neurons in hidden layer
output_size = y_train.shape[1]  # 3 output classes
learning_rate = 0.01
epochs = 100000

# Initialize weights and biases
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))

W2 = np.random.randn(hidden_size, output_size) * 0.01
b2 = np.zeros((1, output_size))

# Training loop
for epoch in range(epochs):
    # Forward pass
    Z1 = X_train @ W1 + b1
    A1 = relu(Z1)
    Z2 = A1 @ W2 + b2
    A2 = softmax(Z2)

    # Loss and accuracy
    loss = cross_entropy_loss(y_train, A2)
    acc = compute_accuracy(y_train, A2)

    # Backward pass
    dZ2 = A2 - y_train
    dW2 = A1.T @ dZ2 / X_train.shape[0]
    db2 = np.sum(dZ2, axis=0, keepdims=True) / X_train.shape[0]

    dA1 = dZ2 @ W2.T
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = X_train.T @ dZ1 / X_train.shape[0]
    db1 = np.sum(dZ1, axis=0, keepdims=True) / X_train.shape[0]

    # Update weights and biases
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

    # Print progress
    if epoch % 10000 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {acc:.4f}")

# Evaluate on test data
Z1_test = X_test @ W1 + b1
A1_test = relu(Z1_test)
Z2_test = A1_test @ W2 + b2
A2_test = softmax(Z2_test)

test_acc = compute_accuracy(y_test, A2_test)
print(f"\nTest Accuracy: {test_acc:.4f}")

Epoch 0, Loss: 1.0994, Accuracy: 0.1500
Epoch 10000, Loss: 0.0577, Accuracy: 0.9833
Epoch 20000, Loss: 0.0496, Accuracy: 0.9833
Epoch 30000, Loss: 0.0479, Accuracy: 0.9833
Epoch 40000, Loss: 0.0472, Accuracy: 0.9833
Epoch 50000, Loss: 0.0470, Accuracy: 0.9833
Epoch 60000, Loss: 0.0469, Accuracy: 0.9833
Epoch 70000, Loss: 0.0468, Accuracy: 0.9833
Epoch 80000, Loss: 0.0467, Accuracy: 0.9833
Epoch 90000, Loss: 0.0467, Accuracy: 0.9833

Test Accuracy: 1.0000
