In [13]:
class LinearLayer:
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weights = np.random.randn(output_dim, input_dim)
        self.biases = np.zeros(output_dim)
        
    def forward(self, inputs):
        self.inputs = inputs
        linear_output = np.dot(inputs, self.weights.T) + self.biases
        activated_output = np.maximum(0, linear_output)  # ReLU activation function
        return activated_output
        
    def backward(self, grad_output):
        grad_input = np.dot(grad_output, self.weights)
        grad_weights = np.dot(grad_output.T, self.inputs).T
        grad_biases = np.sum(grad_output, axis=0)
        # derivative of ReLU activation function
        derivative = np.zeros_like(self.inputs)
        derivative[self.inputs > 0] = 1
        grad_input *= derivative
        return grad_input, grad_weights, grad_biases
        
    def update(self, grad_weights, grad_biases, learning_rate):
        self.weights -= learning_rate * grad_weights.T
        self.biases -= learning_rate * grad_biases
        
    def softmax(self, inputs):
        exp_inputs = np.exp(inputs)
        return exp_inputs / np.sum(exp_inputs, axis=1, keepdims=True)
    
    def cross_entropy_loss(self, inputs, targets):
        softmax_inputs = self.softmax(inputs)
        num_samples = inputs.shape[0]
        loss = -np.sum(targets * np.log(softmax_inputs)) / num_samples
        grad_output = (softmax_inputs - targets) / num_samples
        return loss, grad_output

In [17]:
input_dim = 4
output_dim = 3
batch_size = 2



# create a linear layer
linear_layer = LinearLayer(input_dim, output_dim)
print(linear_layer.weights)

# generate some random inputs
inputs = np.random.randn(batch_size, input_dim)

for i in range(3):
    # compute the forward pass
    outputs = linear_layer.forward(inputs)

    # ACHTUNG: RANDOM STATT GRAD_OUTPUT VOM MODEL
    # compute the gradients with respect to the outputs
    grad_output = np.random.randn(batch_size, output_dim)

    # compute the gradients with respect to the inputs, weights, and biases
    grad_input, grad_weights, grad_biases = linear_layer.backward(grad_output)

    # update the weights and biases
    learning_rate = 0.1
    linear_layer.update(grad_weights, grad_biases, learning_rate)

    print(linear_layer.weights)

[[ 1.02262214  1.43792123  0.12519207  0.74436076]
 [-1.57822203  0.23939582 -1.19330818  0.57173171]
 [ 1.19186998 -0.71064691 -1.12131597  0.42942155]]
[[ 0.97167818  1.31236165  0.01443683  0.77490577]
 [-1.29644826  0.45173199 -0.76412721  0.41060842]
 [ 1.15163955 -0.63462864 -1.14214153  0.45070073]]
[[ 0.84523496  1.41709272 -0.1020658   0.84396299]
 [-1.2894713   0.35157285 -0.79360216  0.40832928]
 [ 1.08218648 -0.82448386 -1.30024163  0.49264648]]
[[ 0.93579829  1.41168849  0.00785747  0.79337224]
 [-1.14755191  0.49974817 -0.56175495  0.32650828]
 [ 0.95907355 -0.94563417 -1.49855453  0.56350507]]
