In [None]:
import numpy as np

class NetworkGradientCalculator:
    """
    Gradient calculator for a fixed two-layer network.

    Define your network once, then calculate any gradient by specifying location.

    The network computed by ``forward_pass`` is::

        z1 = x · W1 + b1        h = ReLU(z1)
        z2 = h · W2 + b2        ŷ = softmax(z2)
        L  = -log(ŷ[correct_class])

    All unit indices are 0-based. Every method prints a step-by-step
    explanation in addition to returning its result.
    """

    def __init__(self, input_to_hidden_weights, hidden_to_output_weights,
                 hidden_bias=None, output_bias=None):
        """
        Initialize network with predefined weights

        Parameters:
        -----------
        input_to_hidden_weights : list/array, shape (n_inputs, n_hidden)
            Weights from input layer to hidden layer
        hidden_to_output_weights : list/array, shape (n_hidden, n_outputs)
            Weights from hidden layer to output layer
        hidden_bias : list/array, optional, shape (n_hidden,)
            Bias terms for hidden layer (defaults to zeros)
        output_bias : list/array, optional, shape (n_outputs,)
            Bias terms for output layer (defaults to zeros)

        Raises:
        -------
        ValueError
            If either weight matrix is not 2-D, or if the number of hidden
            units implied by the two matrices disagrees.
        """

        # Store network architecture
        self.W1 = np.array(input_to_hidden_weights)   # Input → Hidden
        self.W2 = np.array(hidden_to_output_weights)  # Hidden → Output

        # Fail here with a readable message instead of deep inside np.dot later.
        if self.W1.ndim != 2 or self.W2.ndim != 2:
            raise ValueError(
                "Weight matrices must be 2-D: got W1 with shape "
                f"{self.W1.shape} and W2 with shape {self.W2.shape}"
            )
        if self.W1.shape[1] != self.W2.shape[0]:
            raise ValueError(
                f"Hidden sizes disagree: W1 has {self.W1.shape[1]} columns "
                f"but W2 has {self.W2.shape[0]} rows"
            )

        self.b1 = np.array(hidden_bias) if hidden_bias is not None else np.zeros(self.W1.shape[1])
        self.b2 = np.array(output_bias) if output_bias is not None else np.zeros(self.W2.shape[1])

        # Network dimensions
        self.n_inputs = self.W1.shape[0]
        self.n_hidden = self.W1.shape[1]
        self.n_outputs = self.W2.shape[1]

        # Forward pass storage (filled in by forward_pass)
        self.x = None
        self.z1 = None  # Hidden pre-activation
        self.h = None   # Hidden activation (after ReLU)
        self.z2 = None  # Output pre-activation
        self.y_hat = None  # Output probabilities (after softmax)
        self.target_class = None

        print(f"Network initialized: {self.n_inputs} inputs → {self.n_hidden} hidden → {self.n_outputs} outputs")
        self.show_network()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _require_forward_pass(self):
        """Raise ValueError unless forward_pass() has stored activations."""
        if self.h is None:
            raise ValueError("Must call forward_pass() first!")

    @staticmethod
    def _in_range(index, size):
        """Return True when 0 <= index < size (negative indices are rejected)."""
        return 0 <= index < size

    @staticmethod
    def _direction(gradient):
        """Describe how gradient descent (w ← w − η·∂L/∂w) moves the weight."""
        if gradient > 0:
            return '📉 Decrease weight'
        if gradient < 0:
            return '📈 Increase weight'
        return '➖ No change (zero gradient)'

    def _output_error(self):
        """Return the vector ŷ − t, where t is the one-hot target."""
        error = self.y_hat.copy()
        error[self.target_class] -= 1
        return error

    def _hidden_delta(self, unit):
        """
        Backpropagate the output error to hidden unit `unit`.

        Returns (∂L/∂h_unit, ReLU'(z1_unit), δ_unit) where
        δ_unit = ∂L/∂h_unit × ReLU'(z1_unit).
        """
        dL_dh = np.dot(self._output_error(), self.W2[unit])
        relu_derivative = 1 if self.z1[unit] > 0 else 0
        return dL_dh, relu_derivative, dL_dh * relu_derivative

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def show_network(self):
        """Display the network structure and weights"""
        print("\n=== NETWORK STRUCTURE ===")
        print("Input → Hidden weights (W1):")
        print(self.W1)
        print(f"Hidden bias (b1): {self.b1}")
        print("\nHidden → Output weights (W2):")
        print(self.W2)
        print(f"Output bias (b2): {self.b2}")

    def forward_pass(self, inputs, correct_class):
        """
        Perform forward propagation

        Parameters:
        -----------
        inputs : list/array, shape (n_inputs,), input values
        correct_class : int, index of correct output class (0-indexed)

        Returns:
        --------
        y_hat : array, shape (n_outputs,), softmax probabilities

        Raises:
        -------
        ValueError
            If `inputs` does not have shape (n_inputs,) or `correct_class`
            is outside 0..n_outputs-1. Stored state is left untouched in
            that case.
        """
        x = np.array(inputs)
        if x.shape != (self.n_inputs,):
            raise ValueError(
                f"Expected inputs of shape ({self.n_inputs},), got {x.shape}"
            )
        # A negative class would index ŷ from the end for the loss but never
        # match any output in the gradient formulas, so reject it outright.
        if not self._in_range(correct_class, self.n_outputs):
            raise ValueError(
                f"Invalid correct_class={correct_class} "
                f"(valid range, 0-indexed: 0..{self.n_outputs - 1})"
            )

        self.x = x
        self.target_class = correct_class

        # Hidden layer: z1 = x * W1 + b1, h = ReLU(z1)
        self.z1 = np.dot(self.x, self.W1) + self.b1
        self.h = np.maximum(0, self.z1)

        # Output layer: z2 = h * W2 + b2, y = softmax(z2)
        self.z2 = np.dot(self.h, self.W2) + self.b2
        exp_z2 = np.exp(self.z2 - np.max(self.z2))  # For numerical stability
        self.y_hat = exp_z2 / np.sum(exp_z2)

        print("\n=== FORWARD PASS ===")
        print(f"Input: {self.x}")
        print(f"Hidden pre-activation (z1): {self.z1}")
        print(f"Hidden activation (h): {self.h}")
        print(f"Output pre-activation (z2): {self.z2}")
        print(f"Output probabilities (ŷ): {self.y_hat}")
        print(f"Correct class: {correct_class}")
        print(f"Loss: {-np.log(self.y_hat[correct_class]):.6f}")

        return self.y_hat

    def get_gradient(self, layer, from_unit, to_unit):
        """
        Calculate gradient for weight at specified location

        Parameters:
        -----------
        layer : int, which layer (1 = input→hidden, 2 = hidden→output)
        from_unit : int, source unit index (0-indexed)
        to_unit : int, destination unit index (0-indexed)

        Returns:
        --------
        gradient : float

        Raises:
        -------
        ValueError
            If forward_pass() has not been called, `layer` is not 1 or 2,
            or an index is out of range.
        """
        self._require_forward_pass()

        if layer == 2:
            return self._get_output_gradient(from_unit, to_unit)
        elif layer == 1:
            return self._get_hidden_gradient(from_unit, to_unit)
        else:
            raise ValueError("Layer must be 1 (input→hidden) or 2 (hidden→output)")

    def _get_output_gradient(self, from_hidden, to_output):
        """Calculate ∂L/∂w_ij^(2) - gradient for hidden→output weight"""

        # Check bounds (both ends: a negative index would silently wrap around)
        if not (self._in_range(from_hidden, self.n_hidden)
                and self._in_range(to_output, self.n_outputs)):
            raise ValueError(
                f"Invalid indices: from_hidden={from_hidden}, to_output={to_output} "
                f"(valid ranges, 0-indexed: from_hidden 0..{self.n_hidden - 1}, "
                f"to_output 0..{self.n_outputs - 1})"
            )

        # Formula: (ŷ_j - t_j) × h_i
        target = 1 if to_output == self.target_class else 0
        gradient = (self.y_hat[to_output] - target) * self.h[from_hidden]

        print(f"\n=== ∂L/∂w_{from_hidden}{to_output}^(2) ===")
        print(f"Location: Hidden unit {from_hidden} → Output unit {to_output}")
        print(f"Current weight: W2[{from_hidden},{to_output}] = {self.W2[from_hidden, to_output]}")
        print(f"Target for output {to_output}: {target} ({'✓ correct' if target == 1 else '✗ wrong'} class)")
        print(f"Formula: (ŷ_{to_output} - t_{to_output}) × h_{from_hidden}")
        print(f"Values: ({self.y_hat[to_output]:.6f} - {target}) × {self.h[from_hidden]:.6f}")
        print(f"Gradient: {gradient:.6f}")
        # Gradient descent steps AGAINST the gradient: positive ⇒ decrease.
        print(f"Direction: {self._direction(gradient)}")

        return gradient

    def _get_hidden_gradient(self, from_input, to_hidden):
        """Calculate ∂L/∂w_ij^(1) - gradient for input→hidden weight"""

        # Check bounds (both ends: a negative index would silently wrap around)
        if not (self._in_range(from_input, self.n_inputs)
                and self._in_range(to_hidden, self.n_hidden)):
            raise ValueError(
                f"Invalid indices: from_input={from_input}, to_hidden={to_hidden} "
                f"(valid ranges, 0-indexed: from_input 0..{self.n_inputs - 1}, "
                f"to_hidden 0..{self.n_hidden - 1})"
            )

        # Steps 1-3: backpropagated error, ReLU derivative, and their product
        dL_dh, relu_derivative, delta = self._hidden_delta(to_hidden)

        # Step 4: Final gradient
        gradient = delta * self.x[from_input]

        print(f"\n=== ∂L/∂w_{from_input}{to_hidden}^(1) ===")
        print(f"Location: Input unit {from_input} → Hidden unit {to_hidden}")
        print(f"Current weight: W1[{from_input},{to_hidden}] = {self.W1[from_input, to_hidden]}")
        print(f"Step 1: ∂L/∂h_{to_hidden} = {dL_dh:.6f} (backprop from output)")
        print(f"Step 2: ReLU'(z1_{to_hidden}) = {relu_derivative} (z1_{to_hidden} = {self.z1[to_hidden]:.6f})")
        print(f"Step 3: δ_{to_hidden} = {dL_dh:.6f} × {relu_derivative} = {delta:.6f}")
        print(f"Step 4: Gradient = δ_{to_hidden} × x_{from_input} = {delta:.6f} × {self.x[from_input]:.6f}")
        print(f"Gradient: {gradient:.6f}")

        return gradient

    def get_bias_gradient(self, layer, unit):
        """
        Calculate gradient for bias term

        Parameters:
        -----------
        layer : int, which layer (1 = hidden, 2 = output)
        unit : int, which unit's bias (0-indexed)

        Returns:
        --------
        gradient : float

        Raises:
        -------
        ValueError
            If forward_pass() has not been called, `layer` is not 1 or 2,
            or `unit` is out of range for that layer.
        """
        self._require_forward_pass()

        if layer == 2:
            size = self.n_outputs
        elif layer == 1:
            size = self.n_hidden
        else:
            raise ValueError("Layer must be 1 (hidden) or 2 (output)")

        if not self._in_range(unit, size):
            raise ValueError(
                f"Invalid index: unit={unit} (valid range, 0-indexed: 0..{size - 1})"
            )

        if layer == 2:
            # Output bias gradient: ∂L/∂b_j^(2) = (ŷ_j - t_j)
            target = 1 if unit == self.target_class else 0
            gradient = self.y_hat[unit] - target

            print(f"\n=== ∂L/∂b_{unit}^(2) ===")
            print(f"Output bias for unit {unit}")
            print(f"Formula: ŷ_{unit} - t_{unit}")
            print(f"Gradient: {self.y_hat[unit]:.6f} - {target} = {gradient:.6f}")
        else:
            # Hidden bias gradient: same as hidden weight but input = 1
            _, _, gradient = self._hidden_delta(unit)

            print(f"\n=== ∂L/∂b_{unit}^(1) ===")
            print(f"Hidden bias for unit {unit}")
            print(f"Gradient: {gradient:.6f}")

        return gradient

    def show_all_gradients(self):
        """
        Show all weight gradients in the network

        Calls get_gradient for every weight in W2 and then W1, so the
        detailed per-weight explanation is printed before each summary line.
        Raises ValueError if forward_pass() has not been called.
        """
        # Check up front so the banner is not printed before the error.
        self._require_forward_pass()

        print(f"\n{'='*60}")
        print("ALL GRADIENTS IN THE NETWORK")
        print(f"{'='*60}")

        # Output layer gradients
        print("\n🔸 HIDDEN → OUTPUT GRADIENTS:")
        for i in range(self.n_hidden):
            for j in range(self.n_outputs):
                grad = self.get_gradient(2, i, j)
                print(f"  ∂L/∂w_{i}{j}^(2) = {grad:.6f}")

        # Hidden layer gradients
        print("\n🔸 INPUT → HIDDEN GRADIENTS:")
        for i in range(self.n_inputs):
            for j in range(self.n_hidden):
                grad = self.get_gradient(1, i, j)
                print(f"  ∂L/∂w_{i}{j}^(1) = {grad:.6f}")


# ===== TEACHER'S EXAMPLE =====
def teachers_example():
    """
    Reproduce the teacher's worked problem and return the network.

    Builds the 2 → 3 → 3 network from the MATLAB handout, runs one forward
    pass with class index 1 as the target, and prints the gradient of the
    hidden-unit-1 → output-unit-2 weight.
    """

    print("🎯 TEACHER'S PROBLEM")
    print("="*50)

    # Weights copied from the MATLAB code.
    # Rows of the first matrix: bias, x1; columns: h1, h2, h3.
    first_layer = [[-0.8, -0.1, 0.4],
                   [0.0, 0.9, -0.1]]
    # Rows of the second matrix: h1, h2, h3; columns: y1, y2, y3.
    second_layer = [[0.0, 0.1, 0.0],
                    [0.4, -0.2, 0.8],
                    [0.0, -0.2, -0.2]]

    net = NetworkGradientCalculator(
        first_layer,
        second_layer,
        output_bias=[0.4, 0.5, 0.0],
    )

    # Input vector is [bias, x1]; y2 (index 1) is the correct output.
    net.forward_pass([1.0, 0.5], 1)

    # The gradient the teacher asked for: h2 → y3
    print("\n🎯 TEACHER'S QUESTION:")
    gradient = net.get_gradient(2, 1, 2)

    print(f"\nTeacher's MATLAB answer: h(2)*y(3) = {gradient:.6f}")

    return net


# ===== SIMPLE EXAMPLES =====
def simple_examples():
    """
    Walk through a small 2 → 2 → 2 network.

    Runs one forward pass with class 0 as the target, then prints three
    individual weight gradients. Returns nothing.
    """

    print("\n🔸 SIMPLE EXAMPLE")
    print("="*50)

    # Rows: bias, x1 → the two hidden units
    hidden_weights = [[-0.5, 0.3],
                      [0.8, -0.2]]
    # Rows: h1, h2 → the two outputs
    output_weights = [[0.4, -0.6],
                      [0.1, 0.9]]

    net = NetworkGradientCalculator(hidden_weights, output_weights)
    net.forward_pass([1.0, 0.7], correct_class=0)

    print("\n📍 SPECIFIC GRADIENTS:")
    # (layer, from_unit, to_unit) for: h1 → y2, h2 → y1, x1 → h1
    wanted = (
        (2, 0, 1),
        (2, 1, 0),
        (1, 1, 0),
    )
    for layer, source, destination in wanted:
        net.get_gradient(layer, source, destination)


# ===== YOUR CUSTOM NETWORK =====
def my_custom_network():
    """
    Template for your own problems: edit the marked values and re-run.

    Builds a 3 → 3 → 2 network, runs one forward pass, prints a single
    hidden→output weight gradient, and returns the network.
    """

    print("\n🔧 YOUR CUSTOM NETWORK")
    print("="*50)

    # ✏️ EDIT: network weights
    hidden_weights = [
        [-1.0, 0.5, 0.2],   # bias    → hidden units
        [0.3, -0.8, 0.6],   # input 1 → hidden units
        [0.1, 0.4, -0.3],   # input 2 → hidden units (add rows for more inputs)
    ]
    output_weights = [
        [0.7, -0.4],        # hidden unit 1 → outputs
        [0.2, 0.9],         # hidden unit 2 → outputs
        [-0.5, 0.1],        # hidden unit 3 → outputs
    ]
    output_offsets = [0.1, -0.2]  # optional output bias

    network = NetworkGradientCalculator(
        hidden_weights,
        output_weights,
        output_bias=output_offsets,
    )

    # ✏️ EDIT: input vector ([bias, input1, input2, ...]) and correct output index
    sample = [1.0, 0.4, 0.8]
    target_index = 1
    network.forward_pass(sample, target_index)

    # ✏️ EDIT: which gradient to compute
    # layer: 1 = input→hidden, 2 = hidden→output; units are source, destination
    print("\n📍 YOUR GRADIENT:")
    my_gradient = network.get_gradient(layer=2, from_unit=1, to_unit=0)

    print(f"\nYour gradient: {my_gradient:.6f}")

    return network


if __name__ == "__main__":
    # Blank-line-padded rule printed between the demos
    section_break = "\n" + "="*80 + "\n"

    # Demo 1: the teacher's problem
    teachers_net = teachers_example()
    print(section_break)

    # Demo 2: the small 2 → 2 → 2 network
    simple_examples()
    print(section_break)

    # Demo 3: the user-editable template
    my_net = my_custom_network()

    # Closing cheat sheet
    print(f"\n{'='*80}")
    usage_lines = (
        "🎯 USAGE SUMMARY:",
        "1. Define network: NetworkGradientCalculator(W1, W2, ...)",
        "2. Forward pass: net.forward_pass(inputs, target_class)",
        "3. Get gradient: net.get_gradient(layer, from_unit, to_unit)",
        "   - layer=1: input→hidden, layer=2: hidden→output",
        "   - Units are 0-indexed",
        "4. Modify my_custom_network() for your problems!",
    )
    for usage_line in usage_lines:
        print(usage_line)

🎯 TEACHER'S PROBLEM
Network initialized: 2 inputs → 3 hidden → 3 outputs

=== NETWORK STRUCTURE ===
Input → Hidden weights (W1):
[[-0.8 -0.1  0.4]
 [ 0.   0.9 -0.1]]
Hidden bias (b1): [0. 0. 0.]

Hidden → Output weights (W2):
[[ 0.   0.1  0. ]
 [ 0.4 -0.2  0.8]
 [ 0.  -0.2 -0.2]]
Output bias (b2): [0.4 0.5 0. ]

=== FORWARD PASS ===
Input: [1.  0.5]
Hidden pre-activation (z1): [-0.8   0.35  0.35]
Hidden activation (h): [0.   0.35 0.35]
Output pre-activation (z2): [0.54 0.36 0.21]
Output probabilities (ŷ): [0.39151295 0.3270191  0.28146795]
Correct class: 1
Loss: 1.117737

🎯 TEACHER'S QUESTION:

=== ∂L/∂w_12^(2) ===
Location: Hidden unit 1 → Output unit 2
Current weight: W2[1,2] = 0.8
Target for output 2: 0 (✗ wrong class)
Formula: (ŷ_2 - t_2) × h_1
Values: (0.281468 - 0) × 0.350000
Gradient: 0.098514
Direction: 📈 Increase weight

Teacher's MATLAB answer: h(2)*y(3) = 0.098514



🔸 SIMPLE EXAMPLE
Network initialized: 2 inputs → 2 hidden → 2 outputs

=== NETWORK STRUCTURE ===
Input → Hidd

ValueError: Invalid indices: from_hidden=3, to_output=2