In [19]:
class SimpleConv1d:
    def __init__(self, F, initializer, optimizer, stride=1):
        self.F = F  
        self.W = initializer.W(F)  
        self.B = initializer.B(1)  
        self.optimizer = optimizer
        self.stride = stride

    def forward(self, x):
        """Forward pass for 1D convolution.
        Args:
            x: Input array (1D, shape: [N_in])
        Returns:
            A: Output array (1D, shape: [N_out])
        """
        N_in = len(x)
        N_out = (N_in - self.F) // self.stride + 1
        A = np.zeros(N_out)
        for i in range(N_out):
            start = i * self.stride
            A[i] = np.sum(x[start:start+self.F] * self.W) + self.B
        self.x = x  
        return A

    def backward(self, dA):
        """Backward pass.
        Args:
            dA: Gradient from next layer (shape: [N_out])
        Returns:
            dW: Gradient for weights (shape: [F])
            dB: Gradient for bias (scalar)
            dx: Gradient for input (shape: [N_in])
        """
        dW = np.zeros_like(self.W)
        dB = np.sum(dA)
        dx = np.zeros_like(self.x)
        for i in range(len(dA)):
            start = i * self.stride
            dx[start:start+self.F] += dA[i] * self.W
            dW += dA[i] * self.x[start:start+self.F]
        self.optimizer.update(self, dW, dB)  # Update weights/bias
        return dx

In [20]:
def conv1d_output_size(N_in, F, P=0, S=1):
    """
    Calculate output size for 1D convolution.
    
    Args:
        N_in (int): Input size (number of features).
        F (int): Filter size.
        P (int): Padding size (default=0).
        S (int): Stride size (default=1).
        
    Returns:
        int: Output size (number of features).
    """
    N_out = (N_in + 2 * P - F) // S + 1
    return N_out

In [21]:
import numpy as np

class SimpleConv1d:
    def __init__(self, w, b, stride=1):
        self.w = w  # Kernel weights
        self.b = b  # Bias
        self.stride = stride
        self.x = None  # To store input for backprop
    
    def forward(self, x):
        """Forward pass for 1D convolution"""
        self.x = x  # Store input for backprop
        F = len(self.w)
        N_out = (len(x) - F) // self.stride + 1
        a = np.zeros(N_out)
        
        for i in range(N_out):
            start = i * self.stride
            window = x[start:start+F]
            a[i] = np.sum(window * self.w) + self.b
        return a
    
    def backward(self, delta_a):
        """Backward pass for 1D convolution"""
        F = len(self.w)
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a)
        delta_x = np.zeros_like(self.x)
        
        #weight gradients
        for i in range(len(delta_a)):
            start = i * self.stride
            window = self.x[start:start+F]
            delta_w += delta_a[i] * window
        
    
        
        expanded_delta = np.zeros(len(self.x) - F + 1)
        for i in range(len(delta_a)):
            pos = i * self.stride
            expanded_delta[pos] = delta_a[i]
        
        #
        for j in range(len(delta_x)):
            for k in range(F):
                if 0 <= j - k < len(expanded_delta):
                    delta_x[j] += expanded_delta[j - k] * self.w[F - 1 - k]
        
        return delta_w, delta_b, delta_x

# 
x = np.array([1, 2, 3, 4])
w = np.array([3, 5, 7])
b = np.array([1])
delta_a = np.array([10, 20])

# 
conv = SimpleConv1d(w, b)
a = conv.forward(x)
delta_w, delta_b, delta_x = conv.backward(delta_a)

print("Forward pass output:", a)
print("Weight gradients:", delta_w)
print("Bias gradient:", delta_b)
print("Input gradients:", delta_x)



Forward pass output: [35. 50.]
Weight gradients: [ 50  80 110]
Bias gradient: 30
Input gradients: [ 70 190 130  60]


In [22]:
import numpy as np

class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
        # 
        self.w = np.random.randn(out_channels, in_channels, kernel_size).astype(np.float64) * 0.1
        self.b = np.random.randn(out_channels).astype(np.float64) * 0.1
        
        # 
        self.input_shape = None
        self.input_padded = None
    
    def forward(self, x):
        """
        x shape: (in_channels, num_features)
        output shape: (out_channels, num_output_features)
        """
        #
        x = x.astype(np.float64)
        
        #
        if self.padding > 0:
            x_padded = np.pad(x, ((0,0), (self.padding, self.padding)), mode='constant')
        else:
            x_padded = x
        
        self.input_shape = x.shape
        self.input_padded = x_padded
        
        num_features = x.shape[1]
        num_output_features = ((num_features + 2*self.padding - self.kernel_size) // self.stride) + 1
        
        output = np.zeros((self.out_channels, num_output_features), dtype=np.float64)
        
        for out_ch in range(self.out_channels):
            for i in range(num_output_features):
                start = i * self.stride
                window = x_padded[:, start:start+self.kernel_size]
                output[out_ch, i] = np.sum(window * self.w[out_ch]) + self.b[out_ch]
        
        return output
    
    def backward(self, delta_a):
        """
        delta_a shape: (out_channels, num_output_features)
        Returns:
        - delta_w: gradient for weights (same shape as self.w)
        - delta_b: gradient for bias (same shape as self.b)
        - delta_x: gradient for input (same shape as original input)
        """
        delta_a = delta_a.astype(np.float64)
        
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a, axis=1)  
        delta_x_padded = np.zeros_like(self.input_padded)
        
        num_output_features = delta_a.shape[1]
        
        for out_ch in range(self.out_channels):
            for i in range(num_output_features):
                start = i * self.stride
                window = self.input_padded[:, start:start+self.kernel_size]
                
                # Weight gradients
                delta_w[out_ch] += delta_a[out_ch, i] * window
                
                # Input gradients
                delta_x_padded[:, start:start+self.kernel_size] += delta_a[out_ch, i] * self.w[out_ch]
        
        # Remove padding from input gradient 
        if self.padding > 0:
            delta_x = delta_x_padded[:, self.padding:-self.padding]
        else:
            delta_x = delta_x_padded
        
        return delta_w, delta_b, delta_x

########################
x = np.array([[1, 2, 3, 4], [2, 3, 4, 5]], dtype=np.float64)  
w = np.ones((3, 2, 3), dtype=np.float64)  
b = np.array([1, 2, 3], dtype=np.float64)  

conv = Conv1d(in_channels=2, out_channels=3, kernel_size=3)
conv.w = w  
conv.b = b

# 
a = conv.forward(x)
print("Forward pass output:")
print(a)

# Backward pass test
delta_a = np.ones_like(a)  
delta_w, delta_b, delta_x = conv.backward(delta_a)

print("\nWeight gradients:")
print(delta_w)

print("\nBias gradients:")
print(delta_b)

print("\nInput gradients:")
print(delta_x)

# 
assert np.allclose(a, [[16,22], [17,23], [18,24]])


Forward pass output:
[[16. 22.]
 [17. 23.]
 [18. 24.]]

Weight gradients:
[[[3. 5. 7.]
  [5. 7. 9.]]

 [[3. 5. 7.]
  [5. 7. 9.]]

 [[3. 5. 7.]
  [5. 7. 9.]]]

Bias gradients:
[2. 2. 2.]

Input gradients:
[[3. 6. 6. 3.]
 [3. 6. 6. 3.]]


In [23]:
import numpy as np

class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding='valid'):
        """
        Args:
            padding: 'valid' (no padding), 
                    'same' (pad to maintain input size),
                    'zeros' (zero padding with specified size),
                    'edge' (replicate edge values)
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding_mode = padding
        
        # 
        self.w = np.random.randn(out_channels, in_channels, kernel_size) * 0.1
        self.b = np.random.randn(out_channels) * 0.1
        
        # 
        self.input_shape = None
        self.pad_width = None
    
    def _apply_padding(self, x):
        """Apply padding based on padding mode"""
        if self.padding_mode == 'valid':
            return x, 0
        
        if self.padding_mode == 'same':
            pad_total = (self.kernel_size - 1)
            pad_left = pad_total // 2
            pad_right = pad_total - pad_left
            self.pad_width = ((0, 0), (pad_left, pad_right))
            return np.pad(x, self.pad_width, mode='constant'), pad_left
        
        elif isinstance(self.padding_mode, int):
            self.pad_width = ((0, 0), (self.padding_mode, self.padding_mode))
            return np.pad(x, self.pad_width, mode='constant'), self.padding_mode
        
        elif self.padding_mode == 'edge':
            pad_total = (self.kernel_size - 1)
            pad_left = pad_total // 2
            pad_right = pad_total - pad_left
            self.pad_width = ((0, 0), (pad_left, pad_right))
            return np.pad(x, self.pad_width, mode='edge'), pad_left
        
        raise ValueError(f"Unsupported padding mode: {self.padding_mode}")

    def forward(self, x):
        """Forward pass with padding support"""
        x_padded, pad_left = self._apply_padding(x)
        self.input_shape = x.shape
        self.x_padded = x_padded
        
        num_features = x.shape[1]
        if self.padding_mode == 'same':
            num_output_features = num_features
        else:
            num_output_features = ((num_features + 2*pad_left - self.kernel_size) // self.stride) + 1
        
        output = np.zeros((self.out_channels, num_output_features))
        
        for out_ch in range(self.out_channels):
            for i in range(num_output_features):
                start = i * self.stride
                window = x_padded[:, start:start+self.kernel_size]
                output[out_ch, i] = np.sum(window * self.w[out_ch]) + self.b[out_ch]
        
        return output

    def backward(self, delta_a):
        """Backward pass with padding support"""
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a, axis=1)
        
        # Initialize gradient with respect to padded input
        delta_x_padded = np.zeros_like(self.x_padded)
        
        num_output_features = delta_a.shape[1]
        
        for out_ch in range(self.out_channels):
            for i in range(num_output_features):
                start = i * self.stride
                window = self.x_padded[:, start:start+self.kernel_size]
                
                # Weight gradients
                delta_w[out_ch] += delta_a[out_ch, i] * window
                
                # Input gradients
                delta_x_padded[:, start:start+self.kernel_size] += delta_a[out_ch, i] * self.w[out_ch]
        
        # 
        if self.padding_mode == 'valid' or isinstance(self.padding_mode, int):
            pad_left = self.pad_width[1][0] if hasattr(self, 'pad_width') else 0
            delta_x = delta_x_padded[:, pad_left:delta_x_padded.shape[1]-pad_left]
        else:
            delta_x = delta_x_padded
        
        return delta_w, delta_b, delta_x

# Test 
print("=== Test 1: Zero Padding ===")
x = np.array([[1,2,3,4], [5,6,7,8]], dtype=np.float64)
conv = Conv1d(2, 3, kernel_size=3, padding=1)  
conv.w = np.ones_like(conv.w)
conv.b = np.zeros_like(conv.b)
output = conv.forward(x)
print("Forward output (zero padding):\n", output)

print("\n=== Test 2: Same Padding ===")
conv_same = Conv1d(2, 3, kernel_size=3, padding='same') 
conv_same.w = np.ones_like(conv_same.w)
conv_same.b = np.zeros_like(conv_same.b)
output_same = conv_same.forward(x)
print("Forward output (same padding):\n", output_same)
assert output_same.shape[1] == x.shape[1]  

print("\n=== Test 3: Edge Padding ===")
conv_edge = Conv1d(2, 3, kernel_size=3, padding='edge')  
conv_edge.w = np.ones_like(conv_edge.w)
conv_edge.b = np.zeros_like(conv_edge.b)
output_edge = conv_edge.forward(x)
print("Forward output (edge padding):\n", output_edge)

=== Test 1: Zero Padding ===
Forward output (zero padding):
 [[14. 24. 30. 22.]
 [14. 24. 30. 22.]
 [14. 24. 30. 22.]]

=== Test 2: Same Padding ===
Forward output (same padding):
 [[14. 24. 30. 22.]
 [14. 24. 30. 22.]
 [14. 24. 30. 22.]]

=== Test 3: Edge Padding ===
Forward output (edge padding):
 [[20. 24. 30. 34.]
 [20. 24. 30. 34.]
 [20. 24. 30. 34.]]


In [24]:
import numpy as np

class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
        #
        self.w = np.random.randn(out_channels, in_channels, kernel_size) * 0.1
        self.b = np.random.randn(out_channels) * 0.1
        
        #
        self.input_shape = None
        self.x_padded = None
    
    def forward(self, x):
        """Forward pass with mini-batch support"""
        batch_size = x.shape[0]
        
        # 
        if self.padding > 0:
            self.x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')
        else:
            self.x_padded = x
        
        self.input_shape = x.shape
        
        num_features = x.shape[2]
        num_output_features = ((num_features + 2*self.padding - self.kernel_size) // self.stride) + 1
        
        output = np.zeros((batch_size, self.out_channels, num_output_features))
        
        # Vectorized implementation
        for i in range(num_output_features):
            start = i * self.stride
            window = self.x_padded[:, :, start:start+self.kernel_size]  # (batch, in_ch, kernel)
            
            #
            output[:, :, i] = np.tensordot(window, self.w, axes=([1,2], [1,2])) + self.b
        
        return output

    def backward(self, delta_a):
        """Backward pass with mini-batch support"""
        batch_size = delta_a.shape[0]
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a, axis=(0, 2))  
        
        # 
        delta_x_padded = np.zeros_like(self.x_padded)
        
        num_output_features = delta_a.shape[2]
        
        for i in range(num_output_features):
            start = i * self.stride
            window = self.x_padded[:, :, start:start+self.kernel_size]  # (batch, in_ch, kernel)
            
            #
            delta_w += np.tensordot(delta_a[:, :, i].T, window, axes=([1], [0]))
            
            #
            delta_x_padded[:, :, start:start+self.kernel_size] += np.einsum(
                'bo,oik->bik', 
                delta_a[:, :, i], 
                self.w
            )
        
        #
        if self.padding > 0:
            delta_x = delta_x_padded[:, :, self.padding:-self.padding]
        else:
            delta_x = delta_x_padded
        
        return delta_w, delta_b, delta_x

#####
print("=== Mini-batch Test ===")
x_batch = np.array([
    [[1,2,3,4], [5,6,7,8]],   
    [[2,3,4,5], [6,7,8,9]],    
    [[3,4,5,6], [7,8,9,10]]    
], dtype=np.float64)  

conv = Conv1d(in_channels=2, out_channels=3, kernel_size=3, padding=1)
conv.w = np.ones_like(conv.w)  
conv.b = np.zeros_like(conv.b)  

# 
output = conv.forward(x_batch)
print("Forward output shape:", output.shape)
print("First sample output:\n", output[0])

# 
delta_a = np.ones_like(output)
delta_w, delta_b, delta_x = conv.backward(delta_a)

print("\nWeight gradients shape:", delta_w.shape)
print("Bias gradients:", delta_b)
print("Input gradients shape:", delta_x.shape)

=== Mini-batch Test ===
Forward output shape: (3, 3, 4)
First sample output:
 [[14. 24. 30. 22.]
 [14. 24. 30. 22.]
 [14. 24. 30. 22.]]

Weight gradients shape: (3, 2, 3)
Bias gradients: [12. 12. 12.]
Input gradients shape: (3, 2, 4)


In [25]:
import numpy as np

class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """
        Args:
            in_channels: Number of input channels
            out_channels: Number of output channels
            kernel_size: Size of the convolution kernel
            stride: Stride of the convolution (can be any positive integer)
            padding: Number of zeros to pad on each side
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
        # 
        self.w = np.random.randn(out_channels, in_channels, kernel_size) * 0.1
        self.b = np.random.randn(out_channels) * 0.1
        
        # 
        self.input_shape = None
        self.x_padded = None
        self.windows = None 
    
    def forward(self, x):
        """Forward pass with arbitrary stride support"""
        batch_size = x.shape[0]
        
        # 
        if self.padding > 0:
            self.x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')
        else:
            self.x_padded = x
        
        self.input_shape = x.shape
        
        num_features = x.shape[2]
        num_output_features = ((num_features + 2*self.padding - self.kernel_size) // self.stride) + 1
        
        output = np.zeros((batch_size, self.out_channels, num_output_features))
        self.windows = np.zeros((batch_size, num_output_features, self.in_channels, self.kernel_size))
        
        # 
        for i in range(num_output_features):
            start = i * self.stride
            window = self.x_padded[:, :, start:start+self.kernel_size]
            self.windows[:, i, :, :] = window
            
            #
            output[:, :, i] = np.tensordot(window, self.w, axes=([1,2], [1,2])) + self.b
        
        return output

    def backward(self, delta_a):
        """Backward pass with arbitrary stride support"""
        batch_size = delta_a.shape[0]
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a, axis=(0, 2))
        
        #
        delta_x_padded = np.zeros_like(self.x_padded)
        
        num_output_features = delta_a.shape[2]
        
        #
        delta_w = np.tensordot(
            delta_a.transpose(1, 0, 2).reshape(self.out_channels, -1),
            self.windows.reshape(-1, self.in_channels * self.kernel_size),
            axes=([1], [0])
        ).reshape(self.w.shape)
        
        #
        for i in range(num_output_features):
            start = i * self.stride
            # 
            delta_x_padded[:, :, start:start+self.kernel_size] += np.einsum(
                'bo,oik->bik', 
                delta_a[:, :, i], 
                self.w
            )
        
        #
        if self.padding > 0:
            delta_x = delta_x_padded[:, :, self.padding:-self.padding]
        else:
            delta_x = delta_x_padded
        
        return delta_w, delta_b, delta_x

#
print("=== Test 1: Stride=1 ===")
x1 = np.array([[[1,2,3,4], [5,6,7,8]]], dtype=np.float64)  
conv1 = Conv1d(2, 3, kernel_size=3, stride=1, padding=1)
conv1.w = np.ones_like(conv1.w)
conv1.b = np.zeros_like(conv1.b)
out1 = conv1.forward(x1)
print("Output shape (stride=1):", out1.shape)

print("\n=== Test 2: Stride=2 ===")
x2 = np.array([[[1,2,3,4,5], [6,7,8,9,10]]], dtype=np.float64)  
conv2 = Conv1d(2, 3, kernel_size=3, stride=2, padding=1)
conv2.w = np.ones_like(conv2.w)
conv2.b = np.zeros_like(conv2.b)
out2 = conv2.forward(x2)
print("Output shape (stride=2):", out2.shape)

print("\n=== Test 3: Stride=3 ===")
x3 = np.array([[[1,2,3,4,5,6], [7,8,9,10,11,12]]], dtype=np.float64)  
conv3 = Conv1d(2, 3, kernel_size=3, stride=3, padding=1)
conv3.w = np.ones_like(conv3.w)
conv3.b = np.zeros_like(conv3.b)
out3 = conv3.forward(x3)
print("Output shape (stride=3):", out3.shape)


print("\n=== Backward Pass Test ===")
delta_a = np.ones_like(out3)
delta_w, delta_b, delta_x = conv3.backward(delta_a)
print("Weight gradients shape:", delta_w.shape)
print("Bias gradients:", delta_b)
print("Input gradients shape:", delta_x.shape)

=== Test 1: Stride=1 ===
Output shape (stride=1): (1, 3, 4)

=== Test 2: Stride=2 ===
Output shape (stride=2): (1, 3, 3)

=== Test 3: Stride=3 ===
Output shape (stride=3): (1, 3, 2)

=== Backward Pass Test ===
Weight gradients shape: (3, 2, 3)
Bias gradients: [2. 2. 2.]
Input gradients shape: (1, 2, 6)


In [None]:
import numpy as np
from keras.datasets import mnist
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# MNIST
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 1, 28*28).astype(np.float32) / 255.0  # (60000, 1, 784)
X_test = X_test.reshape(-1, 1, 28*28).astype(np.float32) / 255.0     # (10000, 1, 784)

# 
encoder = OneHotEncoder(sparse=False)
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

#
class HeInitializer:
    def W(self, n_nodes1, n_nodes2):
        return np.random.randn(n_nodes1, n_nodes2) * np.sqrt(2/n_nodes1)
    def B(self, n_nodes2):
        return np.zeros(n_nodes2)

class SGD:
    def __init__(self, lr):
        self.lr = lr
    def update(self, layer):
        layer.W -= self.lr * layer.dW
        layer.B -= self.lr * layer.dB

#
class Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
        #
        self.w = np.random.randn(out_channels, in_channels, kernel_size) * 0.1
        self.b = np.random.randn(out_channels) * 0.1
        
        #
        self.input_shape = None
        self.x_padded = None
    
    def forward(self, x):
        batch_size = x.shape[0]
        
        # Apply padding
        if self.padding > 0:
            self.x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding)), mode='constant')
        else:
            self.x_padded = x
        
        self.input_shape = x.shape
        
        num_features = x.shape[2]
        num_output_features = ((num_features + 2*self.padding - self.kernel_size) // self.stride) + 1
        
        output = np.zeros((batch_size, self.out_channels, num_output_features))
        
        for i in range(num_output_features):
            start = i * self.stride
            window = self.x_padded[:, :, start:start+self.kernel_size]
            output[:, :, i] = np.sum(window[:, np.newaxis, :, :] * self.w[np.newaxis, :, :, :], axis=(2,3)) + self.b
        
        return output
    
    def backward(self, delta_a):
        batch_size = delta_a.shape[0]
        delta_w = np.zeros_like(self.w)
        delta_b = np.sum(delta_a, axis=(0, 2))
        
        delta_x_padded = np.zeros_like(self.x_padded)
        num_output_features = delta_a.shape[2]
        
        for i in range(num_output_features):
            start = i * self.stride
            window = self.x_padded[:, :, start:start+self.kernel_size]
            
            # Weight gradients
            delta_w += np.sum(delta_a[:, :, i][:, :, np.newaxis, np.newaxis] * window[:, np.newaxis, :, :], axis=0)
            
            # Input gradients
            delta_x_padded[:, :, start:start+self.kernel_size] += np.sum(
                delta_a[:, :, i][:, :, np.newaxis] * self.w[np.newaxis, :, :, :], axis=1)
        
        # Remove padding
        if self.padding > 0:
            delta_x = delta_x_padded[:, :, self.padding:-self.padding]
        else:
            delta_x = delta_x_padded
        
        return delta_w, delta_b, delta_x

#
class FC:
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.optimizer = optimizer
        self.X = None
    
    def forward(self, X):
        self.X = X
        return X @ self.W + self.B
    
    def backward(self, dA):
        dZ = dA @ self.W.T
        self.dB = np.sum(dA, axis=0)
        self.dW = self.X.T @ dA
        self.optimizer.update(self)
        return dZ

#
class ReLU:
    def forward(self, A):
        self.A = A
        return np.maximum(0, A)
    
    def backward(self, dZ):
        return dZ * (self.A > 0)

class Softmax:
    def __init__(self):
        self.Z = None
    
    def forward(self, X):
        exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
        self.Z = exp_X / np.sum(exp_X, axis=1, keepdims=True)
        return self.Z
    
    def backward(self, Y):
        batch_size = Y.shape[0]
        return self.Z[:batch_size] - Y

# 
class CNNClassifier:
    def __init__(self, learning_rate=0.01, batch_size=100):
        self.batch_size = batch_size
        self.conv = Conv1d(in_channels=1, out_channels=4, kernel_size=7, stride=1, padding=3)  # Changed stride to 1
        self.fc1 = FC(4*784, 128, HeInitializer(), SGD(learning_rate))  # Changed to 4*784
        self.fc2 = FC(128, 10, HeInitializer(), SGD(learning_rate))
        self.relu = ReLU()
        self.softmax = Softmax()
    
    def forward(self, X):
        # Conv1d forward
        conv_out = self.conv.forward(X)  # Output shape: (batch, 4, 784)
        
        # Flatten for FC layers
        flattened = conv_out.reshape(X.shape[0], -1)  # Flatten all but batch dimension
        
        # FC layers
        A1 = self.fc1.forward(flattened)
        Z1 = self.relu.forward(A1)
        A2 = self.fc2.forward(Z1)
        Z2 = self.softmax.forward(A2)
        return Z2
    
    def backward(self, Y):
        batch_size = Y.shape[0]
        dA2 = self.softmax.backward(Y)
        dZ1 = self.fc2.backward(dA2)
        dA1 = self.relu.backward(dZ1)
        dX = self.fc1.backward(dA1)
        
        # Reshape back to conv output dimensions
        dX_reshaped = dX.reshape(batch_size, 4, 784)
        return dX_reshaped
    
    def train(self, X, y, epochs=5):
        for epoch in range(epochs):
            
            indices = np.random.permutation(len(X))
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            
            for i in range(0, len(X), self.batch_size):
                batch_X = X_shuffled[i:i+self.batch_size]
                batch_y = y_shuffled[i:i+self.batch_size]
                
                # Forward pass
                output = self.forward(batch_X)
                
                # Backward pass
                self.backward(batch_y)
            
            # Calculate accuracy after each epoch
            pred = self.predict(X_test)
            acc = accuracy_score(y_test, pred)
            print(f"Epoch {epoch+1}, Accuracy: {acc:.4f}")
    
    def predict(self, X):
        proba = self.forward(X)
        return np.argmax(proba, axis=1)

# Initialize and train
cnn = CNNClassifier(learning_rate=0.01, batch_size=100)
cnn.train(X_train, y_train_onehot, epochs=5)

# 
y_pred = cnn.predict(X_test)
final_acc = accuracy_score(y_test, y_pred)
print(f"\nFinal Test Accuracy: {final_acc:.4f}")



Epoch 1, Accuracy: 0.0982
Epoch 2, Accuracy: 0.1135
Epoch 3, Accuracy: 0.1010
Epoch 4, Accuracy: 0.1135
Epoch 5, Accuracy: 0.1010

Final Test Accuracy: 0.1010
