In [1]:
import numpy as np

# Model configuration and reproducible parameter initialisation
SEED = 0  # Fixed seed so Restart & Run All reproduces the same numbers
D = 4  # Dimension of input and residual stream
F = 8  # Number of features

# Seed NumPy's global generator: every later np.random.* call in this cell
# (including the random input drawn for the forward pass) becomes deterministic.
np.random.seed(SEED)

W_enc = np.random.randn(F, D)  # Encoder weights, shape (F, D)
b_enc = np.random.randn(F)     # Encoder biases, shape (F,)
W_dec = np.random.randn(D, F)  # Decoder weights, shape (D, F)
b_dec = np.random.randn(D)     # Decoder biases, shape (D,)

# Step 1: Define a sparse feature extraction function
def extract_features(x, weights=None, biases=None):
    """Encode an input into non-negative feature activations.

    Computes ``ReLU(weights @ x + biases)``.

    Args:
        x: Input vector (this notebook passes a 1-D array of length ``D``).
        weights: Optional encoder weight matrix. Defaults to the
            module-level ``W_enc``.
        biases: Optional encoder bias vector. Defaults to the module-level
            ``b_enc``.

    Returns:
        Array of feature activations; negative pre-activations are clamped
        to zero.
    """
    # Fall back to the notebook-level parameters so existing one-argument
    # calls keep working unchanged.
    W = W_enc if weights is None else weights
    b = b_enc if biases is None else biases

    pre_activation = np.dot(W, x) + b
    return np.maximum(0, pre_activation)  # ReLU activation

# Step 2: Define the reconstruction function with residual flow
def reconstruct_with_residual(x, f_x, weights=None, biases=None):
    """Decode feature activations and add the result onto the input.

    Computes ``x + (biases + weights @ f_x)``.

    Args:
        x: Input vector the decoded output is added to.
        f_x: Feature activations to decode.
        weights: Optional decoder weight matrix. Defaults to the
            module-level ``W_dec``.
        biases: Optional decoder bias vector. Defaults to the module-level
            ``b_dec``.

    Returns:
        The sum of ``x`` and the decoded vector.
    """
    # Fall back to the notebook-level parameters so existing two-argument
    # calls keep working unchanged.
    W = W_dec if weights is None else weights
    b = b_dec if biases is None else biases

    x_hat = b + np.dot(W, f_x)  # Decoder output
    return x + x_hat            # Residual addition

# Step 3: Define the loss function with L2 and L1 penalties
def calculate_loss(x, x_residual, f_x, lambda_param=0.1, decoder_weights=None):
    """Total loss: squared-error term plus a norm-weighted L1 sparsity penalty.

    Computes
    ``sum((x - x_residual)**2) + lambda_param * sum(f_x * ||W[:, i]||_2)``,
    where ``||W[:, i]||_2`` is the L2 norm of decoder column ``i``.

    Args:
        x: Original input vector.
        x_residual: Vector compared against ``x`` in the squared-error term.
        f_x: Feature activations being penalised.
        lambda_param: Weight of the sparsity penalty.
        decoder_weights: Optional decoder weight matrix whose column norms
            scale the penalty. Defaults to the module-level ``W_dec``.

    Returns:
        Scalar total loss.
    """
    # NOTE(review): when x_residual comes from reconstruct_with_residual
    # (x + x_hat), the squared-error term below equals sum(x_hat ** 2), i.e.
    # it penalises the decoder output's magnitude rather than measuring how
    # well x is reconstructed. Confirm this is the intended objective.
    W = W_dec if decoder_weights is None else decoder_weights

    # L2 reconstruction loss
    reconstruction_loss = np.sum((x - x_residual) ** 2)

    # L1 penalty, each feature scaled by the norm of its decoder column
    column_norms = np.linalg.norm(W, axis=0)
    feature_penalty = lambda_param * np.sum(f_x * column_norms)

    return reconstruction_loss + feature_penalty

# Step 4: End-to-end forward pass on a single random input vector
x = np.random.randn(D)

# Encode -> decode (plus residual) -> score
f_x = extract_features(x)
x_residual = reconstruct_with_residual(x, f_x)
loss = calculate_loss(x, x_residual, f_x)

# Report the results, one "label value" line each
print(
    "\n".join(
        f"{label} {value!s}"
        for label, value in (
            ("Input:", x),
            ("Feature Activations:", f_x),
            ("Reconstructed Output with Residual:", x_residual),
            ("Loss:", loss),
        )
    )
)

# Step 5: Feature Steering
def steer_features(f_x, feature_indices, steer_values):
    """Return a copy of the activations with selected features overridden.

    The input array is left untouched so the unsteered activations remain
    available for comparison (and the calling cell stays idempotent).

    Args:
        f_x: Feature activation array.
        feature_indices: Index or indices of the features to override.
        steer_values: Value(s) assigned at ``feature_indices``.

    Returns:
        A new array equal to ``f_x`` except at ``feature_indices``.
    """
    # Work on a copy: assigning into f_x directly would silently change the
    # caller's array as well.
    steered = np.array(f_x, copy=True)
    steered[feature_indices] = steer_values
    return steered

# Demo: pin features 0 and 1 to chosen activations, then decode and re-score
f_x_steered = steer_features(f_x, feature_indices=[0, 1], steer_values=[1.0, 0.5])
x_residual_steered = reconstruct_with_residual(x, f_x_steered)
loss_steered = calculate_loss(x, x_residual_steered, f_x_steered)

# Report the steered results (the first label carries the leading blank line)
print(
    "\n".join(
        f"{label} {value!s}"
        for label, value in (
            ("\nSteered Feature Activations:", f_x_steered),
            ("Reconstructed Output with Residual (Steered):", x_residual_steered),
            ("Loss (Steered):", loss_steered),
        )
    )
)


Input: [-0.64223429  0.88752081 -0.32236932 -0.57512958]
Feature Activations: [0.95404876 0.61400691 0.         0.         0.         0.
 2.42046216 0.        ]
Reconstructed Output with Residual: [ 0.91340046  7.49750137 -1.73897795 -1.04784908]
Loss: 49.20095554100703

Steered Feature Activations: [1.         0.5        0.         0.         0.         0.
 2.42046216 0.        ]
Reconstructed Output with Residual (Steered): [ 0.73410739  7.38566116 -1.58989494 -1.15259828]
Loss (Steered): 46.89788779312734
