In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset (UCI "optdigits" training file; it has no header row)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra"
data = pd.read_csv(url, header=None)

# Extract features and target variable: every column except the last is a
# feature, the last column holds the class label
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# One-hot encode the target variable (the encoder expects a 2-D column)
encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(y.reshape(-1, 1))

# Split data into training and testing sets BEFORE scaling, so that the
# held-out rows cannot influence the statistics used for normalization
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42
)

# Normalize the features: fit mean/std on the training rows only, then apply
# the same transform to the test rows (avoids train/test leakage)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that
# any later code referring to X_normalized still works
X_normalized = scaler.transform(X)

# Define the softmax activation function
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    The per-row maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps ``np.exp`` from overflowing on
    large inputs.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=-1, keepdims=True)
    return numerators / denominators

# Define the derivative of the softmax activation function
def softmax_derivative(x):
    """Return ``p * (1 - p)`` element-wise, where ``p = softmax(x)``.

    These are only the diagonal entries of the softmax Jacobian (the
    derivative of each output with respect to its own input); the
    cross terms between different classes are not included.
    NOTE(review): nothing in the visible code calls this helper.
    """
    probs = softmax(x)
    complement = 1 - probs
    return probs * complement

# Define the categorical cross-entropy loss function
def categorical_cross_entropy_loss(y_true, y_pred):
    """Return the cross-entropy between ``y_true`` and ``y_pred``.

    The total ``-sum(y_true * log(y_pred))`` is divided by ``len(y_true)``,
    i.e. by the size of the first axis of ``y_true``. Predictions are
    clipped to ``[1e-15, 1 - 1e-15]`` first so the logarithm never sees 0.
    """
    tiny = 1e-15
    safe_pred = np.clip(y_pred, tiny, 1 - tiny)  # guard against log(0)
    weighted_log = y_true * np.log(safe_pred)
    n_rows = len(y_true)
    return -np.sum(weighted_log) / n_rows

# Define the derivative of the categorical cross-entropy loss function
def categorical_cross_entropy_loss_derivative(y_true, y_pred):
    """Return the element-wise difference ``y_pred - y_true``.

    When ``y_pred`` is a softmax output and ``y_true`` is one-hot, this
    difference is the gradient of the (un-averaged) cross-entropy with
    respect to the pre-softmax scores, not with respect to ``y_pred``.
    """
    gradient = y_pred - y_true
    return gradient

# Initialize weights and biases for a one-hidden-layer network
input_size = X_train.shape[1]
output_size = y_train.shape[1]
hidden_size = 100

W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

# Training parameters
# The data gradient below is averaged over the batch so that it matches the
# mean cross-entropy that is reported as the loss. A step of 1e-4 on the
# summed gradient corresponds to 1e-4 * n_samples on the mean gradient
# (about 0.3 for a training set of roughly 3,000 rows).
learning_rate = 0.3
epochs = 100
l2_lambda = 0.0001  # Regularization parameter
n_samples = X_train.shape[0]

# Training loop (full-batch gradient descent)
for epoch in range(epochs):
    # Forward pass
    z1 = np.dot(X_train, W1) + b1
    a1 = np.tanh(z1)  # Using tanh activation in the hidden layer
    z2 = np.dot(a1, W2) + b2
    a2 = softmax(z2)  # Using softmax activation in the output layer

    # Compute loss with L2 regularization (biases are not regularized)
    loss = categorical_cross_entropy_loss(y_train, a2)
    l2_regularization = 0.5 * l2_lambda * (np.sum(W1**2) + np.sum(W2**2))
    loss += l2_regularization

    # Backpropagation
    # Dividing by n_samples makes dz2 the gradient of the MEAN cross-entropy,
    # so the data term and the L2 term are weighted exactly as in `loss`.
    dz2 = categorical_cross_entropy_loss_derivative(y_train, a2) / n_samples
    dW2 = np.dot(a1.T, dz2) + l2_lambda * W2  # Regularization term added to gradient
    db2 = np.sum(dz2, axis=0, keepdims=True)
    dz1 = np.dot(dz2, W2.T) * (1 - np.power(a1, 2))  # Derivative of tanh activation
    dW1 = np.dot(X_train.T, dz1) + l2_lambda * W1  # Regularization term added to gradient
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # Update weights and biases
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

# Evaluate the model on training data: rerun the forward pass with the
# current weights and biases
z1_train = X_train.dot(W1) + b1
a1_train = np.tanh(z1_train)
z2_train = a1_train.dot(W2) + b2
a2_train = softmax(z2_train)

# Accuracy is the fraction of rows whose highest-scoring class matches the
# position of the 1 in the one-hot label
train_predicted_labels = a2_train.argmax(axis=1)
train_actual_labels = y_train.argmax(axis=1)
train_accuracy = (train_predicted_labels == train_actual_labels).mean()
print(f'Training Accuracy: {train_accuracy}')

# Print examples of predicted and actual values from training set
# (five distinct rows drawn at random)
print("Examples of Predicted and Actual Values from Training Set:")
train_sample_indices = np.random.choice(len(X_train), 5, replace=False)
train_sample_predictions = train_predicted_labels[train_sample_indices]
train_sample_actual = train_actual_labels[train_sample_indices]
for example_no, (predicted, actual) in enumerate(
    zip(train_sample_predictions, train_sample_actual), start=1
):
    print(f"Example {example_no}: Predicted: {predicted}, Actual: {actual}")

# Evaluate the model on test data: same forward pass as in training, applied
# to the held-out rows
z1_test = X_test.dot(W1) + b1
a1_test = np.tanh(z1_test)
z2_test = a1_test.dot(W2) + b2
a2_test = softmax(z2_test)

# Accuracy is the fraction of rows whose highest-scoring class matches the
# position of the 1 in the one-hot label
test_predicted_labels = a2_test.argmax(axis=1)
test_actual_labels = y_test.argmax(axis=1)
test_accuracy = (test_predicted_labels == test_actual_labels).mean()
print(f'Test Accuracy: {test_accuracy}')

# Print examples of predicted and actual values from test set
# (five distinct rows drawn at random)
print("Examples of Predicted and Actual Values from Test Set:")
test_sample_indices = np.random.choice(len(X_test), 5, replace=False)
test_sample_predictions = test_predicted_labels[test_sample_indices]
test_sample_actual = test_actual_labels[test_sample_indices]
for example_no, (predicted, actual) in enumerate(
    zip(test_sample_predictions, test_sample_actual), start=1
):
    print(f"Example {example_no}: Predicted: {predicted}, Actual: {actual}")

Epoch 0, Loss: 18.164158367159814
Epoch 10, Loss: 4.912115945178549
Epoch 20, Loss: 2.5110615094752524
Epoch 30, Loss: 1.7723199047185423
Epoch 40, Loss: 1.4143368489444432
Epoch 50, Loss: 1.1960694914941765
Epoch 60, Loss: 1.048362407705014
Epoch 70, Loss: 0.9406506325950137
Epoch 80, Loss: 0.8592417438497277
Epoch 90, Loss: 0.7961298786736062
Training Accuracy: 0.9283845650752126
Examples of Predicted and Actual Values from Training Set:
Example 1: Predicted: 5, Actual: 5
Example 2: Predicted: 3, Actual: 3
Example 3: Predicted: 5, Actual: 5
Example 4: Predicted: 9, Actual: 9
Example 5: Predicted: 0, Actual: 0
Test Accuracy: 0.8771241830065359
Examples of Predicted and Actual Values from Test Set:
Example 1: Predicted: 6, Actual: 6
Example 2: Predicted: 3, Actual: 9
Example 3: Predicted: 7, Actual: 7
Example 4: Predicted: 5, Actual: 5
Example 5: Predicted: 0, Actual: 0
