# Optimal Activation Benchmark

In [1]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import boston_housing
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # Added ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_iris, load_wine, load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn import datasets
import random
import time # To measure execution time
import warnings # To suppress warnings if needed

# Suppress TensorFlow warnings for cleaner output (optional)
# tf.get_logger().setLevel('ERROR')
# warnings.filterwarnings('ignore')

# --- Configuration ---
N_SEEDS = 5         # Number of random seeds to run for averaging results
BASE_SEED = 42    # Base seed for reproducibility
SEEDS = [BASE_SEED + i for i in range(N_SEEDS)] # List of seeds for multiple runs

# Batch sizes to test
BATCH_SIZES = [16, 32, 64, 128, 256]

# Training parameters
EPOCHS = 200
VALIDATION_SPLIT = 0.2
EARLY_STOPPING_PATIENCE = 20 # Patience for EarlyStopping
REDUCE_LR_PATIENCE = 10     # Patience for ReduceLROnPlateau
REDUCE_LR_FACTOR = 0.2      # Factor to reduce learning rate by

2025-05-13 06:07:38.780852: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747116459.008514      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747116459.071473      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- Seed Setting Function ---
def set_seed(seed=42):
    """Sets random seeds for reproducibility."""
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Ensure TF uses deterministic operations where possible
    # Note: This might impact performance. Remove if not strictly needed.
    # tf.config.experimental.enable_op_determinism() # Might require specific TF versions/configs

# Set initial seed for data splitting consistency
set_seed(BASE_SEED)

In [3]:
# --- Custom Activation Functions ---
class OptimA(Layer):  # Optimal Activation
    """
    Custom activation layer 'OptimA' with trainable parameters.
    Combines tanh and softplus * sigmoid components.
    """
    def __init__(self, **kwargs):
        super(OptimA, self).__init__(**kwargs)

    def build(self, input_shape):
        """Defines the trainable weights (parameters) of the activation function."""
        self.alpha = self.add_weight(name='alpha', shape=(), initializer='ones', trainable=True)
        self.beta = self.add_weight(name='beta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.gamma = self.add_weight(name='gamma', shape=(), initializer='ones', trainable=True)
        self.delta = self.add_weight(name='delta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.lambda_ = self.add_weight(name='lambda', shape=(), initializer='ones', trainable=True)
        super(OptimA, self).build(input_shape) # Ensure build is called for the parent

    def call(self, x):
        """Defines the forward pass of the activation function."""
        term1 = self.alpha * tf.math.tanh(self.beta * x)
        term2 = self.gamma * tf.math.softplus(self.delta * x) * tf.math.sigmoid(self.lambda_ * x)
        return term1 + term2

    def get_config(self):
        """Ensures the layer can be saved and loaded."""
        config = super(OptimA, self).get_config()
        # No specific state needs to be added here unless non-weight parameters are used
        return config

class OptimALinear(Layer):  # Optimal Activation (Linear Approximation)
    """
    Custom activation layer 'OptimALinear' using linear approximations
    for tanh, softplus, and sigmoid. Includes trainable parameters.
    """
    def __init__(self, epsilon=1e-5, **kwargs):
        super(OptimALinear, self).__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        """Defines the trainable weights (parameters) of the activation function."""
        self.alpha = self.add_weight(name='alpha', shape=(), initializer='ones', trainable=True)
        self.beta = self.add_weight(name='beta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.gamma = self.add_weight(name='gamma', shape=(), initializer='ones', trainable=True)
        self.delta = self.add_weight(name='delta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.lambda_ = self.add_weight(name='lambda', shape=(), initializer='ones', trainable=True)
        super(OptimALinear, self).build(input_shape) # Ensure build is called for the parent

    def call(self, x):
        """Defines the forward pass using linear approximations."""
        # Linear approximation for tanh (clipping)
        term1 = self.alpha * tf.clip_by_value(self.beta * x, -1.0, 1.0)

        # Linear approximations for softplus (ReLU-like) and sigmoid (linear segment)
        # Softplus approx: max(0, val) + epsilon (to avoid potential zero multiplication)
        softplus_approx = tf.maximum(0.0, self.delta * x) + self.epsilon
        # Sigmoid approx near 0: 0.5 + 0.25 * val (first term of Taylor expansion around 0)
        sigmoid_approx = tf.clip_by_value(0.5 + 0.25 * self.lambda_ * x, 0.0, 1.0) # Clip to [0,1]

        term2 = self.gamma * softplus_approx * sigmoid_approx
        return term1 + term2

    def get_config(self):
        """Ensures the layer can be saved and loaded."""
        config = super(OptimALinear, self).get_config()
        config.update({"epsilon": self.epsilon})
        return config

In [4]:
# --- Data Loading and Preparation ---
print("Loading and preparing data...")

# Dictionary to hold dataset configurations
datasets_config = {}

Loading and preparing data...


In [5]:
# 1. Regression: Boston Housing (Note: Boston Housing is often discouraged due to ethical concerns, but kept here as per the original request)
try:
    (x_train_boston, y_train_boston), (x_test_boston, y_test_boston) = boston_housing.load_data(seed=BASE_SEED)
    scaler_boston = StandardScaler()
    x_train_boston = scaler_boston.fit_transform(x_train_boston)
    x_test_boston = scaler_boston.transform(x_test_boston)
    datasets_config['Boston Housing'] = {
        'data': (x_train_boston, y_train_boston, x_test_boston, y_test_boston),
        'task_type': 'regression',
        'metric_name': 'MAE' # Mean Absolute Error
    }
except Exception as e:
    print(f"Warning: Could not load Boston Housing dataset. Skipping. Error: {e}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz
[1m57026/57026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [6]:
# 2. Classification: Iris
data_iris = load_iris()
x_train_iris, x_test_iris, y_train_iris, y_test_iris = train_test_split(data_iris.data, data_iris.target, test_size=0.2, random_state=BASE_SEED, stratify=data_iris.target)
scaler_iris = StandardScaler()
x_train_iris = scaler_iris.fit_transform(x_train_iris)
x_test_iris = scaler_iris.transform(x_test_iris)
y_train_iris_cat = to_categorical(y_train_iris) # Keep original for potential different loss functions if needed
y_test_iris_cat = to_categorical(y_test_iris)
datasets_config['Iris'] = {
    'data': (x_train_iris, y_train_iris_cat, x_test_iris, y_test_iris_cat),
    'task_type': 'classification',
    'metric_name': 'Accuracy'
}

In [7]:
# 3. Multiclass Classification: Wine
data_wine = load_wine()
x_train_wine, x_test_wine, y_train_wine, y_test_wine = train_test_split(data_wine.data, data_wine.target, test_size=0.2, random_state=BASE_SEED, stratify=data_wine.target)
scaler_wine = StandardScaler()
x_train_wine = scaler_wine.fit_transform(x_train_wine)
x_test_wine = scaler_wine.transform(x_test_wine)
y_train_wine_cat = to_categorical(y_train_wine)
y_test_wine_cat = to_categorical(y_test_wine)
datasets_config['Wine'] = {
    'data': (x_train_wine, y_train_wine_cat, x_test_wine, y_test_wine_cat),
    'task_type': 'classification',
    'metric_name': 'Accuracy'
}

In [8]:
# 4. Regression: Diabetes Dataset
data_diabetes = load_diabetes()
x_train_diabetes, x_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    data_diabetes.data, data_diabetes.target, test_size=0.2, random_state=BASE_SEED
)
scaler_diabetes = StandardScaler()
x_train_diabetes = scaler_diabetes.fit_transform(x_train_diabetes)
x_test_diabetes = scaler_diabetes.transform(x_test_diabetes)
datasets_config['Diabetes'] = {
    'data': (x_train_diabetes, y_train_diabetes, x_test_diabetes, y_test_diabetes),
    'task_type': 'regression',
    'metric_name': 'MAE'
}

In [9]:
# 5. Regression: California Housing Dataset
data_california = fetch_california_housing()
x_train_california, x_test_california, y_train_california, y_test_california = train_test_split(
    data_california.data, data_california.target, test_size=0.2, random_state=BASE_SEED
)
scaler_california = StandardScaler()
x_train_california = scaler_california.fit_transform(x_train_california)
x_test_california = scaler_california.transform(x_test_california)
datasets_config['California Housing'] = {
    'data': (x_train_california, y_train_california, x_test_california, y_test_california),
    'task_type': 'regression',
    'metric_name': 'MAE'
}

In [10]:
print("Data loading and preparation complete.")

Data loading and preparation complete.


In [11]:
# --- Model Building and Evaluation Function ---
def build_and_evaluate_model(x_train, y_train, x_test, y_test, activation_instance,
                             task_type="classification", batch_size=32):
    """
    Builds, compiles, trains, and evaluates a simple Sequential model.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_test: Testing features.
        y_test: Testing targets.
        activation_instance: An instantiated activation layer or a string identifier.
        task_type (str): 'classification' or 'regression'.
        batch_size (int): Batch size for training.

    Returns:
        float: The evaluation metric score (Accuracy for classification, MAE for regression).
               Returns np.nan if training fails.
    """
    # Ensure a new model is created for each call
    model = Sequential()
    model.add(Input(shape=(x_train.shape[1],))) # Use Input layer for explicit shape definition
    model.add(Dense(64, activation=activation_instance)) # Hidden layer

    # Output layer and loss function based on task type
    if task_type == "classification":
        num_classes = y_train.shape[1]
        if num_classes == 1: # Binary classification (should ideally be checked based on unique values in original y)
             # This case isn't used with the current to_categorical preprocessing, but included for completeness
            model.add(Dense(1, activation='sigmoid'))
            loss = 'binary_crossentropy'
            metrics = ['accuracy']
        else:  # Multiclass classification
            model.add(Dense(num_classes, activation='softmax'))
            loss = 'categorical_crossentropy'
            metrics = ['accuracy']
        monitor_metric = 'val_accuracy' # Monitor validation accuracy for callbacks
        eval_metric_index = 1 # metrics list index for accuracy

    elif task_type == "regression":
        model.add(Dense(1)) # Linear output layer
        loss = 'mse' # Mean Squared Error is common for training regression
        metrics = ['mae'] # Mean Absolute Error is often preferred for evaluation
        monitor_metric = 'val_mae' # Monitor validation MAE for callbacks
        eval_metric_index = 1 # metrics list index for MAE
    else:
        raise ValueError(f"Unsupported task type: {task_type}")

    # Compile the model
    optimizer = AdamW(learning_rate=1e-3, beta_1=0.95, beta_2=0.999, amsgrad=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Define callbacks
    early_stop = EarlyStopping(monitor=monitor_metric, patience=EARLY_STOPPING_PATIENCE,
                               restore_best_weights=True, mode='min' if task_type == 'regression' else 'max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor_metric, factor=REDUCE_LR_FACTOR,
                                  patience=REDUCE_LR_PATIENCE, min_lr=1e-6,
                                  mode='min' if task_type == 'regression' else 'max')

    # Train the model
    try:
        history = model.fit(x_train, y_train,
                          validation_split=VALIDATION_SPLIT,
                          epochs=EPOCHS,
                          batch_size=batch_size,
                          callbacks=[early_stop, reduce_lr],
                          verbose=0) # verbose=0 for cleaner output during loops

        # Evaluate the model on the test set
        results = model.evaluate(x_test, y_test, verbose=0)
        return results[eval_metric_index] # Return the desired metric (Accuracy or MAE)

    except Exception as e:
        print(f"      ! Training/Evaluation failed: {e}")
        return np.nan # Return NaN if an error occurs

In [12]:
# --- Activation Functions Dictionary ---
# We need to instantiate custom layers inside the loop later,
# so we store either the class or string name here.
activations_to_test = {
    'OptimA': OptimA, # Store class
    'OptimALinear': OptimALinear, # Store class
    'ReLU': 'relu',
    'ELU': 'elu',
    'Swish': 'swish',
    'GeLU': 'gelu'
}

# --- Results Storage ---
# Structure: results[dataset_name][activation_name][batch_size] = [list_of_scores_from_seeds]
results = {
    ds_name: {
        act_name: {
            bs: [] for bs in BATCH_SIZES
        } for act_name in activations_to_test.keys()
    } for ds_name in datasets_config.keys()
}

# --- Main Experiment Loop ---
print("\n--- Starting Experiment ---")
start_time = time.time()

for i, seed in enumerate(SEEDS):
    print(f"\n--- Running Seed {i+1}/{N_SEEDS} (Seed: {seed}) ---")
    set_seed(seed) # Set seed for this specific run's TF/Numpy operations

    for ds_name, ds_config in datasets_config.items():
        print(f"  Dataset: {ds_name} ({ds_config['task_type']})")
        x_train, y_train, x_test, y_test = ds_config['data']
        task_type = ds_config['task_type']
        metric_name = ds_config['metric_name']

        for act_name, activation_ref in activations_to_test.items():
            print(f"    Activation: {act_name}")

            for bs in BATCH_SIZES:
                # print(f"      Batch Size: {bs} ... ", end="")
                run_start_time = time.time()

                # Instantiate custom layers here for each trial to reset their state
                if activation_ref == OptimA:
                    activation_instance = OptimA()
                elif activation_ref == OptimALinear:
                    activation_instance = OptimALinear()
                else:
                    activation_instance = activation_ref # Use string name directly

                # Build, train, and evaluate
                score = build_and_evaluate_model(
                    x_train, y_train, x_test, y_test,
                    activation_instance=activation_instance,
                    task_type=task_type,
                    batch_size=bs
                )

                run_end_time = time.time()
                run_duration = run_end_time - run_start_time

                # Store the result
                results[ds_name][act_name][bs].append(score)

                # Print result for this run
                if not np.isnan(score):
                    # print(f"Score ({metric_name}): {score:.4f} (Time: {run_duration:.2f}s)")
                     print(f"      Batch: {bs:<3} | {metric_name}: {score:.4f} | Time: {run_duration:.2f}s")
                else:
                    # print("Failed.")
                     print(f"      Batch: {bs:<3} | Failed.")


total_duration = time.time() - start_time
print(f"\n--- Experiment Finished ---")
print(f"Total time: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")


--- Starting Experiment ---

--- Running Seed 1/5 (Seed: 42) ---
  Dataset: Boston Housing (regression)
    Activation: OptimA


I0000 00:00:1747116475.338972      20 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
I0000 00:00:1747116478.928500      58 service.cc:148] XLA service 0x7ac76800e7f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747116478.928948      58 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1747116479.158840      58 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1747116479.815703      58 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


      Batch: 16  | MAE: 3.4083 | Time: 9.54s
      Batch: 32  | MAE: 3.2316 | Time: 6.66s
      Batch: 64  | MAE: 3.3461 | Time: 8.23s
      Batch: 128 | MAE: 3.5783 | Time: 8.68s
      Batch: 256 | MAE: 3.8940 | Time: 10.79s
    Activation: OptimALinear
      Batch: 16  | MAE: 2.3126 | Time: 18.41s
      Batch: 32  | MAE: 2.4301 | Time: 15.40s
      Batch: 64  | MAE: 2.7020 | Time: 13.37s
      Batch: 128 | MAE: 4.2114 | Time: 8.96s
      Batch: 256 | MAE: 4.0601 | Time: 11.96s
    Activation: ReLU
      Batch: 16  | MAE: 2.5436 | Time: 16.75s
      Batch: 32  | MAE: 2.6886 | Time: 12.66s
      Batch: 64  | MAE: 3.3790 | Time: 11.40s
      Batch: 128 | MAE: 4.0395 | Time: 10.61s
      Batch: 256 | MAE: 4.7299 | Time: 10.02s
    Activation: ELU
      Batch: 16  | MAE: 2.4567 | Time: 16.42s
      Batch: 32  | MAE: 3.9160 | Time: 5.68s
      Batch: 64  | MAE: 3.3097 | Time: 11.21s
      Batch: 128 | MAE: 3.9171 | Time: 10.30s
      Batch: 256 | MAE: 4.4725 | Time: 10.20s
    Activation: 

In [13]:
# --- Process and Display Results ---
print("\n--- Processing Results ---")

processed_results = []

for ds_name, ds_results in results.items():
    task_type = datasets_config[ds_name]['task_type']
    metric_name = datasets_config[ds_name]['metric_name']
    # Determine if higher score is better (classification) or lower is better (regression)
    higher_is_better = (task_type == 'classification')

    for act_name, act_results in ds_results.items():
        for bs, scores in act_results.items():
            valid_scores = [s for s in scores if not np.isnan(s)] # Filter out NaNs
            if not valid_scores:
                mean_score, best_score, worst_score = np.nan, np.nan, np.nan
                num_successful_runs = 0
            else:
                mean_score = np.mean(valid_scores)
                num_successful_runs = len(valid_scores)
                if higher_is_better:
                    best_score = np.max(valid_scores)
                    worst_score = np.min(valid_scores)
                else: # Lower is better (MAE)
                    best_score = np.min(valid_scores)
                    worst_score = np.max(valid_scores)

            processed_results.append({
                'Dataset': ds_name,
                'Activation': act_name,
                'Batch Size': bs,
                f'Mean {metric_name}': mean_score,
                f'Best {metric_name}': best_score,
                f'Worst {metric_name}': worst_score,
                'Successful Runs': f"{num_successful_runs}/{N_SEEDS}"
            })

# Create DataFrame
results_df = pd.DataFrame(processed_results)

# Set display options for better readability
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("\n--- Aggregated Results Table ---")
print(f"Metrics averaged over {N_SEEDS} seeds.")
print(f"Note: 'Best' means highest Accuracy for Classification, lowest MAE for Regression.")

# Display sorted results (e.g., by Dataset, then Mean Score)
# Adjust sorting based on what comparison is most important
# Example: Sort by Dataset, then Activation, then Batch Size
results_df_sorted = results_df.sort_values(by=['Dataset', 'Activation', 'Batch Size'])

# Or sort to find the best performing overall (example for classification)
# results_df_sorted = results_df.sort_values(by=['Mean Accuracy'], ascending=False)


--- Processing Results ---

--- Aggregated Results Table ---
Metrics averaged over 5 seeds.
Note: 'Best' means highest Accuracy for Classification, lowest MAE for Regression.


In [14]:
print(results_df_sorted.to_string(index=False, float_format="%.4f"))

           Dataset   Activation  Batch Size  Mean MAE  Best MAE  Worst MAE Successful Runs  Mean Accuracy  Best Accuracy  Worst Accuracy
    Boston Housing          ELU          16    2.7799    2.4567     3.9988             5/5            NaN            NaN             NaN
    Boston Housing          ELU          32    3.5522    2.6957     4.3292             5/5            NaN            NaN             NaN
    Boston Housing          ELU          64    3.4586    3.3097     3.6387             5/5            NaN            NaN             NaN
    Boston Housing          ELU         128    4.0459    3.9171     4.2065             5/5            NaN            NaN             NaN
    Boston Housing          ELU         256    4.4595    4.3622     4.6524             5/5            NaN            NaN             NaN
    Boston Housing         GeLU          16    2.6043    2.5611     2.6328             5/5            NaN            NaN             NaN
    Boston Housing         GeLU          

In [15]:
# Optionally, display results grouped by dataset for clarity
print("\n--- Results Grouped by Dataset ---")
for ds_name in datasets_config.keys():
    print(f"\n--- {ds_name} ---")
    ds_df = results_df[results_df['Dataset'] == ds_name].sort_values(by=['Activation', 'Batch Size'])
    # Sort within the dataset group to find best performance
    metric_col = [col for col in ds_df.columns if col.startswith('Mean ')][0]
    higher_is_better = (datasets_config[ds_name]['task_type'] == 'classification')
    ds_df_sorted = ds_df.sort_values(by=metric_col, ascending=not higher_is_better)
    print(ds_df_sorted.to_string(index=False, float_format="%.4f"))


--- Results Grouped by Dataset ---

--- Boston Housing ---
       Dataset   Activation  Batch Size  Mean MAE  Best MAE  Worst MAE Successful Runs  Mean Accuracy  Best Accuracy  Worst Accuracy
Boston Housing OptimALinear          16    2.3359    2.2925     2.4264             5/5            NaN            NaN             NaN
Boston Housing         GeLU          16    2.6043    2.5611     2.6328             5/5            NaN            NaN             NaN
Boston Housing         ReLU          16    2.6056    2.5436     2.6477             5/5            NaN            NaN             NaN
Boston Housing        Swish          16    2.6105    2.5141     2.6686             5/5            NaN            NaN             NaN
Boston Housing         GeLU          32    2.7630    2.7131     2.8319             5/5            NaN            NaN             NaN
Boston Housing          ELU          16    2.7799    2.4567     3.9988             5/5            NaN            NaN             NaN
Boston Ho