# Optimal Activation Benchmark

In [1]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import boston_housing
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_iris, load_wine, load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn import datasets
import random

In [2]:
def set_seed(seed=42):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

# Fix seed
set_seed(42)

In [3]:
# Define custom activation functions
class OptimA(Layer):  # Optimal Activation
    def __init__(self, **kwargs):
        super(OptimA, self).__init__(**kwargs)

    def build(self, input_shape):
        # Defining trainable parameters
        self.alpha = self.add_weight(name='alpha', shape=(), initializer='ones', trainable=True)
        self.beta = self.add_weight(name='beta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.gamma = self.add_weight(name='gamma', shape=(), initializer='ones', trainable=True)
        self.delta = self.add_weight(name='delta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.lambda_ = self.add_weight(name='lambda', shape=(), initializer='ones', trainable=True)

    def call(self, x):
        term1 = self.alpha * tf.math.tanh(self.beta * x)
        term2 = self.gamma * tf.math.softplus(self.delta * x) * tf.math.sigmoid(self.lambda_ * x)
        return term1 + term2

class OptimALinear(Layer):  # Optimal Activation (Linear Approximation)
    def __init__(self, epsilon=1e-5, **kwargs):
        super(OptimALinear, self).__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        # Defining trainable parameters
        self.alpha = self.add_weight(name='alpha', shape=(), initializer='ones', trainable=True)
        self.beta = self.add_weight(name='beta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.gamma = self.add_weight(name='gamma', shape=(), initializer='ones', trainable=True)
        self.delta = self.add_weight(name='delta', shape=(), initializer=tf.keras.initializers.Constant(0.5), trainable=True)
        self.lambda_ = self.add_weight(name='lambda', shape=(), initializer='ones', trainable=True)

    def call(self, x):
        # Linear approximation for tanh
        term1 = self.alpha * tf.clip_by_value(self.beta * x, -1, 1)
        
        # Linear approximations for softplus and sigmoid
        term2 = self.gamma * (tf.maximum(0.0, self.delta * x) + self.epsilon) * (0.5 + 0.25 * self.lambda_ * x)
        
        return term1 + term2

In [4]:
# Load and prepare data for different tasks
# 1. Regression: Boston Housing
(x_train_boston, y_train_boston), (x_test_boston, y_test_boston) = boston_housing.load_data()
scaler_boston = StandardScaler()
x_train_boston = scaler_boston.fit_transform(x_train_boston)
x_test_boston = scaler_boston.transform(x_test_boston)

# 2. Classification: Iris
data_iris = load_iris()
x_train_iris, x_test_iris, y_train_iris, y_test_iris = train_test_split(data_iris.data, data_iris.target, test_size=0.2)
scaler_iris = StandardScaler()
x_train_iris = scaler_iris.fit_transform(x_train_iris)
x_test_iris = scaler_iris.transform(x_test_iris)
y_train_iris = to_categorical(y_train_iris)
y_test_iris = to_categorical(y_test_iris)

# 3. Multiclass Classification: Wine
data_wine = load_wine()
x_train_wine, x_test_wine, y_train_wine, y_test_wine = train_test_split(data_wine.data, data_wine.target, test_size=0.2)
scaler_wine = StandardScaler()
x_train_wine = scaler_wine.fit_transform(x_train_wine)
x_test_wine = scaler_wine.transform(x_test_wine)
y_train_wine = to_categorical(y_train_wine)
y_test_wine = to_categorical(y_test_wine)

# 4. Regression: Diabetes Dataset
data_diabetes = load_diabetes()
x_train_diabetes, x_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    data_diabetes.data, data_diabetes.target, test_size=0.2, random_state=42
)
scaler_diabetes = StandardScaler()
x_train_diabetes = scaler_diabetes.fit_transform(x_train_diabetes)
x_test_diabetes = scaler_diabetes.transform(x_test_diabetes)

# 5. Regression: California Housing Dataset
data_california = fetch_california_housing()
x_train_california, x_test_california, y_train_california, y_test_california = train_test_split(
    data_california.data, data_california.target, test_size=0.2, random_state=42
)
scaler_california = StandardScaler()
x_train_california = scaler_california.fit_transform(x_train_california)
x_test_california = scaler_california.transform(x_test_california)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz
[1m57026/57026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:
# Function to build and evaluate model for a given task
def build_and_evaluate_model(x_train, y_train, x_test, y_test, activation, name, task_type="classification"):
    model = Sequential()
    if task_type == "classification":
        model.add(Dense(64, activation=activation, input_shape=(x_train.shape[1],)))
        if y_train.ndim == 1:  # Check if it's binary classification by the dimension of y_train
            model.add(Dense(1, activation='sigmoid'))
            loss = 'binary_crossentropy'
            metrics = ['accuracy']
        else:  # Multiclass classification
            model.add(Dense(y_train.shape[1], activation='softmax'))
            loss = 'categorical_crossentropy'
            metrics = ['accuracy']
    elif task_type == "regression":
        model.add(Dense(64, activation=activation, input_shape=(x_train.shape[1],)))
        model.add(Dense(1))
        loss = 'mse'
        metrics = ['mae']
    else:
        raise ValueError("Unsupported task type")
    
    model.compile(optimizer=Adam(), loss=loss, metrics=metrics)
    early_stop = EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(x_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stop], verbose=0)
    
    results = model.evaluate(x_test, y_test, verbose=0)
    return results[1] if task_type == "classification" else results[0]  # accuracy for classification, MAE for regression

In [6]:
# Dictionary of activation functions and results
activations = {
    'OptimA': OptimA(),
    'OptimALinear': OptimALinear(),
    'ReLU': 'relu',
    'ELU': 'elu',
    'Swish': 'swish',
    'GeLU': 'gelu'
}

results = {
    'Boston Housing (Regression)': {},
    'Iris (Classification)': {},
    'Wine (Multiclass Classification)': {},
    'Diabetes (Regression)': {},
    'California Housing (Regression)': {}
}

# Train and evaluate on each task
for name, activation in activations.items():
    print(f"Evaluating {name} activation...")
    results['Boston Housing (Regression)'][name] = build_and_evaluate_model(
        x_train_boston, y_train_boston, x_test_boston, y_test_boston, activation, name, task_type="regression"
    )
    results['Iris (Classification)'][name] = build_and_evaluate_model(
        x_train_iris, y_train_iris, x_test_iris, y_test_iris, activation, name, task_type="classification"
    )
    results['Wine (Multiclass Classification)'][name] = build_and_evaluate_model(
        x_train_wine, y_train_wine, x_test_wine, y_test_wine, activation, name, task_type="classification"
    )
    results['Diabetes (Regression)'][name] = build_and_evaluate_model(
        x_train_diabetes, y_train_diabetes, x_test_diabetes, y_test_diabetes, activation, name, task_type="regression"
    )
    results['California Housing (Regression)'][name] = build_and_evaluate_model(
        x_train_california, y_train_california, x_test_california, y_test_california, activation, name, task_type="regression"
    )

Evaluating OptimA activation...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1730760072.988629      65 service.cc:145] XLA service 0x7e22ac00adc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730760072.988682      65 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1730760073.991342      65 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Evaluating OptimALinear activation...
Evaluating ReLU activation...
Evaluating ELU activation...
Evaluating Swish activation...
Evaluating GeLU activation...


In [7]:
# Display results in a table
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Boston Housing (Regression),Iris (Classification),Wine (Multiclass Classification),Diabetes (Regression),California Housing (Regression)
OptimA,20.355732,1.0,1.0,3021.601807,0.294267
OptimALinear,26.587408,1.0,1.0,3271.583984,0.744984
ReLU,24.586704,0.966667,0.972222,5695.401855,0.323596
ELU,22.702732,1.0,1.0,9833.832031,0.343384
Swish,24.963442,0.966667,1.0,8055.658691,0.350013
GeLU,24.371468,1.0,0.972222,6356.980469,0.323199
