# %% [markdown]
# # Manual Backpropagation Implementation for Iris Dataset
# 
# This notebook implements a complete neural network with manual backpropagation from scratch.
# We'll build everything step-by-step without using any automatic differentiation libraries.

# %%


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
print("Loading Iris Dataset...")
iris = load_iris()

# X: feature matrix (sepal length, sepal width, petal length, petal width)
# y: integer class labels (0=setosa, 1=versicolor, 2=virginica)
X, y = iris.data, iris.target

In [None]:
# Tabular view of the features with a human-readable species column,
# used only for a quick look at the data.
df = pd.DataFrame(data=X, columns=iris.feature_names).assign(
    species=iris.target_names[y]
)

print("\nDataset Info:")
print(df.head())
print(f"\nDataset shape: {X.shape}")
print(f"Classes: {iris.target_names}")

# %% [markdown]
# ## 2. Data Preprocessing
# 
# We'll standardize the features and one-hot encode the target variable.

# %%

In [None]:
print("Preprocessing data...")

# Standardize each feature column; the scaling statistics are computed over
# every row of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# One-hot encode the target variable
def one_hot_encode(y, num_classes):
    """Turn a sequence of integer class labels into a one-hot matrix.

    Returns a float array of shape (len(y), num_classes) in which row ``i``
    has a 1 in column ``y[i]`` and 0 everywhere else.
    """
    n_samples = len(y)
    one_hot = np.zeros((n_samples, num_classes))
    # Fancy indexing sets one cell per row: (row i, column y[i]).
    one_hot[np.arange(n_samples), y] = 1
    return one_hot

# Take the number of classes from the dataset itself instead of hard-coding 3.
y_encoded = one_hot_encode(y, num_classes=len(iris.target_names))


In [None]:
# Split the data.
# The split is done on the raw (unscaled) features so that the standardizer
# can be fitted on the training portion only; fitting it on all samples would
# let test-set statistics leak into the training inputs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y
)

# Learn mean/std from the training split, then apply that same transform to
# both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Feature shape: {X_train.shape[1]}")
print(f"Output classes: {y_encoded.shape[1]}")


# %% [markdown]
# ## 3. Manual Neural Network Implementation
# 
# Now we'll implement our neural network class with manual backpropagation.

# %%

In [None]:
class ManualNeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, seed=None):
        """
        Initialize neural network with random weights and zero biases
        Architecture: Input -> Hidden -> Output

        Args:
            input_size: number of input features (rows of W1).
            hidden_size: number of hidden units (columns of W1, rows of W2).
            output_size: number of output units (columns of W2).
            learning_rate: step size stored on the instance for gradient descent.
            seed: optional integer seed. When given, the weights are drawn from
                a private ``np.random.RandomState(seed)`` so that construction
                is reproducible; when ``None`` (default) NumPy's global random
                state is used, exactly as before.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # The np.random module and RandomState both expose ``randn``, so one
        # code path covers the seeded and unseeded cases.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Weights are standard-normal draws scaled by sqrt(2 / fan_in).
        # Note: the factor 2 over fan_in is He-style scaling; Xavier/Glorot
        # would use 1 / fan_in or 2 / (fan_in + fan_out).
        self.W1 = rng.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = rng.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Cached values for backpropagation, unset until they are computed
        self.z1 = None  # Pre-activation (hidden layer)
        self.a1 = None  # Post-activation (hidden layer)
        self.z2 = None  # Pre-activation (output layer)
        self.a2 = None  # Post-activation (output layer)

        # Training history (start empty)
        self.loss_history = []
        self.accuracy_history = []

# %% [markdown]
# ### Activation Functions
# 
# Let's implement our activation functions and their derivatives:

# %%

In [None]:
def sigmoid(self, x):
    """Sigmoid activation function, 1 / (1 + exp(-x)), applied element-wise."""
    # Clip x to prevent overflow in np.exp
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(self, x):
    """Derivative of the sigmoid at pre-activation ``x``: s(x) * (1 - s(x))."""
    s = self.sigmoid(x)
    return s * (1 - s)


def softmax(self, x):
    """Softmax activation for the output layer, normalised along axis 1."""
    # Subtract the per-row max for numerical stability
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)


# This cell lives outside the ``class ManualNeuralNetwork`` body, so the
# functions are attached to the class explicitly. A plain function stored as a
# class attribute is looked up as a regular method (``self`` is passed
# automatically).
ManualNeuralNetwork.sigmoid = sigmoid
ManualNeuralNetwork.sigmoid_derivative = sigmoid_derivative
ManualNeuralNetwork.softmax = softmax


# %% [markdown]
# ### Forward Pass Implementation
# 
# The forward pass computes predictions by passing data through the network:

# %%

In [None]:
def forward_pass(self, X):
    """
    Forward propagation through the network.

    Computes sigmoid(X . W1 + b1) for the hidden layer and
    softmax(a1 . W2 + b2) for the output layer. The intermediate values
    z1, a1, z2 and a2 are stored on the instance (overwriting any previous
    ones); ``backward_pass`` reads them.

    Args:
        X: input batch whose second dimension matches the rows of ``self.W1``.

    Returns:
        ``self.a2``, the softmax output for every row of ``X``.
    """
    # Hidden layer
    self.z1 = np.dot(X, self.W1) + self.b1  # Linear transformation
    self.a1 = self.sigmoid(self.z1)         # Activation

    # Output layer
    self.z2 = np.dot(self.a1, self.W2) + self.b2  # Linear transformation
    self.a2 = self.softmax(self.z2)               # Softmax activation

    return self.a2


# This cell lives outside the ``class ManualNeuralNetwork`` body, so the
# function is attached to the class explicitly to make it a method.
ManualNeuralNetwork.forward_pass = forward_pass

# %% [markdown]
# ### Loss Function
# 
# We'll use cross-entropy loss for multi-class classification:

# %%

In [None]:
def compute_loss(self, y_true, y_pred):
    """
    Compute cross-entropy loss.

    Args:
        y_true: target matrix, multiplied element-wise with log(y_pred)
            (one-hot rows in this notebook).
        y_pred: predicted probabilities with the same shape as ``y_true``.

    Returns:
        Mean over rows of -sum(y_true * log(y_pred)).
    """
    # Keep probabilities inside [epsilon, 1 - epsilon] to prevent log(0)
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Cross-entropy: sum over classes (axis 1), then average over samples
    loss = -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
    return loss


# This cell lives outside the ``class ManualNeuralNetwork`` body, so the
# function is attached to the class explicitly to make it a method.
ManualNeuralNetwork.compute_loss = compute_loss

# %% [markdown]
# ### Manual Backpropagation Implementation
# 
# This is the core of our implementation - computing gradients manually using the chain rule:

# %%

In [None]:
def backward_pass(self, X, y_true):
    """
    Manual backpropagation implementation.

    Uses the values cached on the instance (``self.z1``, ``self.a1``,
    ``self.a2``), so ``forward_pass`` must have been run on the same ``X``
    immediately before calling this.

    Args:
        X: the input batch that produced the cached activations.
        y_true: targets with the same shape as ``self.a2``.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)`` of gradients averaged over the
        ``X.shape[0]`` samples. No parameters are modified here.
    """
    m = X.shape[0]  # Number of samples

    # Step 1: Compute output layer gradients
    # dL/dz2 = a2 - y_true (derivative of softmax + cross-entropy)
    dz2 = self.a2 - y_true

    # dL/dW2 = a1^T * dz2
    dW2 = (1/m) * np.dot(self.a1.T, dz2)

    # dL/db2 = mean(dz2) over samples; keepdims keeps the (1, n) bias shape
    db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

    # Step 2: Compute hidden layer gradients
    # dL/da1 = dz2 * W2^T
    da1 = np.dot(dz2, self.W2.T)

    # dL/dz1 = da1 * sigmoid'(z1)
    dz1 = da1 * self.sigmoid_derivative(self.z1)

    # dL/dW1 = X^T * dz1
    dW1 = (1/m) * np.dot(X.T, dz1)

    # dL/db1 = mean(dz1) over samples
    db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

    return dW1, db1, dW2, db2


# This cell lives outside the ``class ManualNeuralNetwork`` body, so the
# function is attached to the class explicitly to make it a method.
ManualNeuralNetwork.backward_pass = backward_pass

# %% [markdown]
# ### Parameter Update and Training Methods

# %%

In [None]:
def update_parameters(self, dW1, db1, dW2, db2):
    """
    Update parameters using gradient descent.

    Each parameter is moved in place by ``-learning_rate * gradient``.
    """
    self.W1 -= self.learning_rate * dW1
    self.b1 -= self.learning_rate * db1
    self.W2 -= self.learning_rate * dW2
    self.b2 -= self.learning_rate * db2


def train_step(self, X, y):
    """
    Single training step: forward pass + backward pass + parameter update.

    Returns:
        ``(loss, predictions)``; both are computed before the parameters are
        updated in this step.
    """
    # Forward pass
    predictions = self.forward_pass(X)

    # Compute loss
    loss = self.compute_loss(y, predictions)

    # Backward pass
    dW1, db1, dW2, db2 = self.backward_pass(X, y)

    # Update parameters
    self.update_parameters(dW1, db1, dW2, db2)

    return loss, predictions


def predict(self, X):
    """Make predictions (runs ``forward_pass``, which also refreshes the cached activations)."""
    return self.forward_pass(X)


def calculate_accuracy(self, X, y_true):
    """Calculate accuracy: fraction of rows whose argmax prediction equals the argmax of ``y_true``."""
    predictions = self.predict(X)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_true, axis=1)
    accuracy = np.mean(predicted_classes == true_classes)
    return accuracy


def train(self, X_train, y_train, X_val, y_val, epochs=1000, print_every=100):
    """
    Train the neural network with full-batch gradient descent.

    Every epoch runs one ``train_step`` on the whole training set and appends
    the loss (measured before that epoch's update) and the training accuracy
    (measured after it) to ``loss_history`` / ``accuracy_history``.

    Args:
        X_train, y_train: training inputs and targets.
        X_val, y_val: held-out data, used only for the progress print-out.
        epochs: number of full-batch updates to perform.
        print_every: print progress every this many epochs; the final epoch is
            always printed. A value of 0 or None prints the final epoch only.
    """
    print(f"Training neural network for {epochs} epochs...")
    print(f"Architecture: {self.input_size} -> {self.hidden_size} -> {self.output_size}")
    print(f"Learning rate: {self.learning_rate}")
    print("-" * 50)

    for epoch in range(epochs):
        # Training step
        loss, _ = self.train_step(X_train, y_train)

        # Training accuracy is recorded every epoch
        train_acc = self.calculate_accuracy(X_train, y_train)

        # Store history
        self.loss_history.append(loss)
        self.accuracy_history.append(train_acc)

        # Print progress. Validation accuracy is only needed for the log line,
        # so it is evaluated only on epochs that are actually printed.
        is_last_epoch = epoch == epochs - 1
        if is_last_epoch or (print_every and epoch % print_every == 0):
            val_acc = self.calculate_accuracy(X_val, y_val)
            print(f"Epoch {epoch:4d} | Loss: {loss:.4f} | "
                  f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")


# This cell lives outside the ``class ManualNeuralNetwork`` body, so the
# functions are attached to the class explicitly to make them methods.
ManualNeuralNetwork.update_parameters = update_parameters
ManualNeuralNetwork.train_step = train_step
ManualNeuralNetwork.predict = predict
ManualNeuralNetwork.calculate_accuracy = calculate_accuracy
ManualNeuralNetwork.train = train


# %% [markdown]
# ## 4. Train the Neural Network
# 
# Now let's create and train our neural network:

# %%

In [None]:
# Initialize and train the neural network
print("\n" + "="*60)
print("MANUAL BACKPROPAGATION NEURAL NETWORK")
print("="*60)

# The weights are drawn from NumPy's global random state; seeding it here
# makes the run below repeatable after a kernel restart.
np.random.seed(42)

# Create neural network (layer sizes are read from the data)
nn = ManualNeuralNetwork(
    input_size=X_train.shape[1],    # 4 features in Iris dataset
    hidden_size=8,                  # 8 hidden neurons
    output_size=y_train.shape[1],   # 3 classes (one-hot columns)
    learning_rate=0.1
)

# Train the network
nn.train(X_train, y_train, X_test, y_test, epochs=1000, print_every=200)


# %% [markdown]
# ## 5. Evaluate the Model

# %%

In [None]:
# Final evaluation of the trained network
print("\n" + "="*60)
print("FINAL EVALUATION")
print("="*60)

# Accuracy on the training split first, then on the test split
train_accuracy, test_accuracy = (
    nn.calculate_accuracy(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

print(f"Final Training Accuracy: {train_accuracy:.4f}")
print(f"Final Test Accuracy: {test_accuracy:.4f}")

# Class probabilities for the test split and the winning class per row
test_predictions = nn.predict(X_test)
predicted_classes = test_predictions.argmax(axis=1)
true_classes = y_test.argmax(axis=1)

# Show the first five test samples with the probability of the predicted class
print("\nSample Predictions:")
for i in range(5):
    pred_class = iris.target_names[predicted_classes[i]]
    true_class = iris.target_names[true_classes[i]]
    confidence = test_predictions[i, predicted_classes[i]]
    print(f"Sample {i+1}: Predicted={pred_class}, True={true_class}, Confidence={confidence:.3f}")

# %% [markdown]
# ## 6. Visualize Training Progress

# %%

In [None]:
# Plot training history: loss on the left panel, accuracy on the right
plt.figure(figsize=(12, 4))

for panel, (history, panel_title, y_label) in enumerate(
    (
        (nn.loss_history, 'Training Loss', 'Cross-Entropy Loss'),
        (nn.accuracy_history, 'Training Accuracy', 'Accuracy'),
    ),
    start=1,
):
    plt.subplot(1, 2, panel)
    plt.plot(history)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.grid(True)

plt.tight_layout()
plt.show()


# %% [markdown]
# ## 7. Detailed Backpropagation Analysis
# 
# Let's examine the internals of our backpropagation implementation:

# %%


In [None]:
# Report the network's layer sizes and parameter shapes
print("\n" + "="*60)
print("BACKPROPAGATION ANALYSIS")
print("="*60)

# Layer sizes as stored on the network object
print("Network Architecture:")
print(f"Input Layer: {nn.input_size} neurons (features)")
print(f"Hidden Layer: {nn.hidden_size} neurons with sigmoid activation")
print(f"Output Layer: {nn.output_size} neurons with softmax activation")

# Shapes of the weight matrices
print("\nWeight Matrices:")
print(f"W1 shape: {nn.W1.shape} (input to hidden)")
print(f"W2 shape: {nn.W2.shape} (hidden to output)")

# Shapes of the bias rows
print("\nBias Vectors:")
print(f"b1 shape: {nn.b1.shape} (hidden layer bias)")
print(f"b2 shape: {nn.b2.shape} (output layer bias)")

# %% [markdown]
# ### Step-by-Step Forward and Backward Pass Demo

# %%

In [None]:
# Walk a single training sample through one forward and one backward pass
print("\nDemonstrating one forward-backward pass with first training sample:")

# Slicing with [:1] keeps the leading batch dimension (one row each)
sample_X, sample_y = X_train[:1], y_train[:1]

print(f"Input: {sample_X}")
print(f"Target: {sample_y}")

# Forward pass (also refreshes nn.z1 / nn.a1 / nn.z2 / nn.a2)
output = nn.forward_pass(sample_X)
print(f"Output: {output}")
print(f"Predicted class: {iris.target_names[np.argmax(output)]}")

# Values cached by the forward pass
print("\nIntermediate activations:")
print(f"z1 (hidden pre-activation): {nn.z1}")
print(f"a1 (hidden post-activation): {nn.a1}")
print(f"z2 (output pre-activation): {nn.z2}")
print(f"a2 (output post-activation): {nn.a2}")

# Backward pass: gradients only, the parameters are not updated here
dW1, db1, dW2, db2 = nn.backward_pass(sample_X, sample_y)
print("\nGradients:")
print(f"dW1 shape: {dW1.shape}, mean absolute value: {np.mean(np.abs(dW1)):.6f}")
print(f"db1 shape: {db1.shape}, mean absolute value: {np.mean(np.abs(db1)):.6f}")
print(f"dW2 shape: {dW2.shape}, mean absolute value: {np.mean(np.abs(dW2)):.6f}")
print(f"db2 shape: {db2.shape}, mean absolute value: {np.mean(np.abs(db2)):.6f}")


# %% [markdown]
# ## 8. Summary
# 
# This implementation demonstrates complete manual backpropagation; the cell below prints a recap of what it includes:

# %%

In [None]:
# Closing banner followed by the recap, one item per line
print("\n" + "="*60)
print("IMPLEMENTATION COMPLETE!")
print("="*60)
print(
    "This implementation includes:",
    "1. Manual forward propagation",
    "2. Manual backpropagation with gradient calculations",
    "3. Manual parameter updates using gradient descent",
    "4. No use of automatic differentiation libraries",
    "5. Complete training loop with loss and accuracy tracking",
    sep="\n",
)