In [1]:
# --- Imports ---
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Input, Dropout, LSTM, Embedding, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import AdamW, Adam # Adam is common for LSTMs
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import datasets as hf_datasets # Hugging Face datasets library for easy loading
import numpy as np
import pandas as pd
import random
import time
import os
import gc

# --- Configuration (general and NLP-specific) ---
# Reproducibility / repetition
RAND_SEED = 42                # base seed; run i uses RAND_SEED + i
NUM_RUNS_LSTM = 1             # how many times the whole benchmark is repeated

# Text preprocessing
VOCAB_SIZE_LSTM = 20_000      # upper bound on the tokenizer vocabulary
MAX_LENGTH_LSTM = 150         # every sequence is padded / truncated to this many tokens

# Network width
EMBEDDING_DIM_LSTM = 128      # size of each word-embedding vector
LSTM_UNITS_LSTM = 128         # hidden units of the LSTM layer

# Optimisation schedule
BATCH_SIZE_LSTM = 32
EPOCHS_LSTM = 30              # maximum epochs per training run
PATIENCE_LSTM = 5             # patience handed to the early-stopping callback
LEARNING_RATE_LSTM = 1e-4     # initial learning rate given to the optimizer

2025-05-12 06:48:30.713144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747032510.967001      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747032511.041888      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def set_seed(seed=42):
    """Seed TensorFlow, NumPy and Python's `random` module with the same value.

    Afterwards the seed is also written to the PYTHONHASHSEED environment variable
    (as a string).

    Args:
        seed: integer seed applied to all three generators. Defaults to 42.
    """
    # Same order as always: TensorFlow, then NumPy, then the stdlib generator.
    for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
        seed_fn(seed)
    # NOTE(review): the running interpreter reads PYTHONHASHSEED at start-up, so this
    # presumably only matters for processes spawned later — confirm if hash
    # determinism inside this kernel is required.
    os.environ['PYTHONHASHSEED'] = f"{seed}"

# Seed every generator once up front with the notebook-wide seed constant.
set_seed(RAND_SEED)

In [3]:
# --- Custom Activation Layers (OptimA and OptimALinear) ---
class OptimA(Layer):
    """Trainable activation with five scalar coefficients.

    Computes ``alpha * tanh(beta * x) + gamma * softplus(delta * x) * sigmoid(lambda * x)``.
    ``alpha``, ``gamma`` and ``lambda`` start at 1; ``beta`` and ``delta`` start at 0.5.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the five scalar weights (shape ``()``, all trainable)."""
        scalar = dict(shape=(), trainable=True)
        # Creation order is kept fixed so the layer's weight list keeps its layout.
        self.alpha = self.add_weight(name='alpha', initializer='ones', **scalar)
        self.beta = self.add_weight(
            name='beta', initializer=tf.keras.initializers.Constant(0.5), **scalar)
        self.gamma = self.add_weight(name='gamma', initializer='ones', **scalar)
        self.delta = self.add_weight(
            name='delta', initializer=tf.keras.initializers.Constant(0.5), **scalar)
        self.lambda_ = self.add_weight(name='lambda', initializer='ones', **scalar)
        super().build(input_shape)

    def call(self, x):
        """Apply the activation element-wise to ``x``."""
        squash = self.alpha * tf.math.tanh(self.beta * x)
        soft_gate = tf.math.sigmoid(self.lambda_ * x)
        scaled_softplus = self.gamma * tf.math.softplus(self.delta * x)
        return squash + scaled_softplus * soft_gate

    def get_config(self):
        """Return the base layer config; this layer adds no constructor arguments."""
        return super().get_config()

class OptimALinear(Layer):
    """Piecewise-linear variant of the trainable activation.

    Computes
    ``alpha * clip(beta * x, -1, 1) + gamma * (max(0, delta * x) + epsilon) * (0.5 + 0.25 * lambda * x)``.
    ``alpha``, ``gamma`` and ``lambda`` start at 1; ``beta`` and ``delta`` start at 0.5.

    Args:
        epsilon: constant added to the ``max(0, delta * x)`` term. Defaults to 1e-5.
    """

    def __init__(self, epsilon=1e-5, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        """Create the five scalar weights (shape ``()``, all trainable)."""
        scalar = dict(shape=(), trainable=True)
        # Creation order is kept fixed so the layer's weight list keeps its layout.
        self.alpha = self.add_weight(name='alpha', initializer='ones', **scalar)
        self.beta = self.add_weight(
            name='beta', initializer=tf.keras.initializers.Constant(0.5), **scalar)
        self.gamma = self.add_weight(name='gamma', initializer='ones', **scalar)
        self.delta = self.add_weight(
            name='delta', initializer=tf.keras.initializers.Constant(0.5), **scalar)
        self.lambda_ = self.add_weight(name='lambda', initializer='ones', **scalar)
        super().build(input_shape)

    def call(self, x):
        """Apply the activation element-wise to ``x``."""
        clipped = tf.clip_by_value(self.beta * x, -1, 1)
        ramp = tf.maximum(0.0, self.delta * x) + self.epsilon
        linear_gate = 0.5 + 0.25 * self.lambda_ * x
        return self.alpha * clipped + self.gamma * ramp * linear_gate

    def get_config(self):
        """Return the base layer config extended with ``epsilon``."""
        return {**super().get_config(), 'epsilon': self.epsilon}

# --- Helper to get activation layer ---
def get_activation(act_config):
    """Turn an activation specification into a Keras layer that can be called on a tensor.

    Args:
        act_config: one of
            * a string: wrapped in ``tf.keras.layers.Activation``;
            * a ``Layer`` instance: an ``OptimA`` / ``OptimALinear`` instance (exact type)
              is replaced by a freshly constructed layer of the same type, presumably so
              that trainable coefficients are not shared between models; any other layer
              instance is returned unchanged;
            * a ``Layer`` subclass: instantiated with no arguments.

    Returns:
        A layer instance.

    Raises:
        ValueError: if ``act_config`` is none of the above.
    """
    if isinstance(act_config, str):
        return tf.keras.layers.Activation(act_config)

    if isinstance(act_config, Layer):
        if type(act_config) is OptimA:
            return OptimA()
        if type(act_config) is OptimALinear:
            # Carry the configured epsilon over to the fresh layer; rebuilding with the
            # default would silently discard a caller-supplied value.
            return OptimALinear(epsilon=act_config.epsilon)
        return act_config

    if isinstance(act_config, type) and issubclass(act_config, Layer):
        return act_config()

    raise ValueError(f"Unsupported activation: {act_config}")

# --- Data Loading and Preprocessing (LSTM) ---
def load_and_prepare_nlp_data_lstm(dataset_name, vocab_size, max_length, batch_size, seed):
    """Load a Hugging Face text-classification dataset and build padded tf.data pipelines.

    The tokenizer is fitted on the train split only. The test split is returned as the
    second dataset; the caller uses it for validation / early stopping.

    Args:
        dataset_name: "imdb" or "ag_news".
        vocab_size: ``num_words`` cap handed to the Keras ``Tokenizer``.
        max_length: length every sequence is padded / truncated to (both at the end).
        batch_size: batch size of both returned datasets.
        seed: seed used when shuffling the training dataset.

    Returns:
        Tuple ``(train_dataset, test_dataset, num_classes, actual_vocab_size)`` where the
        datasets yield ``(padded_token_ids, one_hot_labels)`` batches and
        ``actual_vocab_size`` is ``min(vocab_size, len(tokenizer.word_index) + 1)``.

    Raises:
        ValueError: if ``dataset_name`` is not supported.
    """
    # dataset name -> (text column, label column)
    dataset_fields = {
        "imdb": ("text", "label"),
        "ag_news": ("text", "label"),
    }

    print(f"Loading dataset: {dataset_name}")
    if dataset_name not in dataset_fields:
        raise ValueError(f"Unsupported NLP dataset: {dataset_name}")
    text_field, label_field = dataset_fields[dataset_name]

    # No trust_remote_code: the recorded download log shows imdb being fetched as Parquet
    # files, so no remote loading script has to be executed.
    # NOTE(review): ag_news is assumed to be published the same way — confirm.
    raw_ds = hf_datasets.load_dataset(dataset_name)
    train_split, test_split = raw_ds["train"], raw_ds["test"]
    num_classes = train_split.features[label_field].num_classes

    # Read whole columns instead of walking each split row by row (twice).
    train_texts = list(train_split[text_field])
    test_texts = list(test_split[text_field])
    train_labels = np.array(list(train_split[label_field]))
    test_labels = np.array(list(test_split[label_field]))

    print("Tokenizing texts for LSTM...")
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<oov>")
    tokenizer.fit_on_texts(train_texts)  # vocabulary comes from the training texts only

    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)

    pad_kwargs = dict(maxlen=max_length, padding='post', truncating='post')
    train_padded = pad_sequences(train_sequences, **pad_kwargs)
    test_padded = pad_sequences(test_sequences, **pad_kwargs)

    train_labels_cat = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes)
    test_labels_cat = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)

    def to_tf_dataset(features, labels, shuffle=False):
        """Wrap arrays in a batched, prefetching tf.data.Dataset (optionally shuffled)."""
        ds = tf.data.Dataset.from_tensor_slices((features, labels))
        if shuffle:
            # Buffer covers the whole set; a new order is drawn every epoch.
            ds = ds.shuffle(buffer_size=len(features), seed=seed, reshuffle_each_iteration=True)
        ds = ds.batch(batch_size)
        return ds.prefetch(tf.data.AUTOTUNE)

    tf_train_dataset = to_tf_dataset(train_padded, train_labels_cat, shuffle=True)
    tf_test_dataset = to_tf_dataset(test_padded, test_labels_cat)  # used for validation by the caller

    # +1 leaves room for index 0, which the padded sequences use as filler.
    actual_vocab_size = min(vocab_size, len(tokenizer.word_index) + 1)
    print(f"Actual vocabulary size used: {actual_vocab_size}")

    print(f"Number of classes for {dataset_name}: {num_classes}")
    return tf_train_dataset, tf_test_dataset, num_classes, actual_vocab_size

# --- Model Building (LSTM) ---
def build_lstm_classifier_model(actual_vocab_size_local, embedding_dim_local, lstm_units_local,
                                num_classes_local, activation_func_config, max_length_local, strategy=None):
    """Build the (uncompiled) Embedding -> LSTM -> Dense -> activation -> softmax classifier.

    Args:
        actual_vocab_size_local: ``input_dim`` of the embedding layer.
        embedding_dim_local: ``output_dim`` of the embedding layer.
        lstm_units_local: number of LSTM units.
        num_classes_local: width of the softmax output layer.
        activation_func_config: activation spec accepted by ``get_activation`` (string,
            layer instance or layer class); applied after the 128-unit dense layer.
        max_length_local: fixed input sequence length.
        strategy: optional distribution strategy; the model is created inside its scope.
            When omitted, the model is created under ``tf.device('/CPU:0')``.

    Returns:
        A ``Model`` named ``LSTM_<activation name>_Classifier``.
    """
    # Human-readable activation name, only used in the model name.
    if isinstance(activation_func_config, str):
        activation_name = activation_func_config
    elif hasattr(activation_func_config, '__name__'):
        activation_name = activation_func_config.__name__
    else:
        activation_name = activation_func_config.__class__.__name__

    # NOTE(review): without a strategy the model is pinned to the CPU even when a GPU is
    # present — confirm that this fallback is intended.
    context = strategy.scope() if strategy else tf.device('/CPU:0')

    with context:
        input_layer = Input(shape=(max_length_local,), name='input_sequence')
        # The Input layer already fixes the sequence length, so the deprecated
        # `input_length` argument is not passed to Embedding.
        # NOTE(review): mask_zero is left at its default, so padding zeros reach the LSTM
        # as ordinary tokens — confirm this is wanted for the benchmark.
        embedding_layer = Embedding(input_dim=actual_vocab_size_local,
                                    output_dim=embedding_dim_local,
                                    name='embedding')(input_layer)
        # NOTE(review): Keras documents that a non-zero recurrent_dropout rules out the
        # fused cuDNN kernel; presumably a large part of the per-step time on GPU.
        lstm_layer = LSTM(lstm_units_local, dropout=0.2, recurrent_dropout=0.2, name='lstm_1')(embedding_layer)
        x = Dropout(0.3, name='dropout_lstm_out')(lstm_layer)
        x = Dense(128, name='dense_pre_activation')(x)
        x = get_activation(activation_func_config)(x)  # the activation under test
        x = Dropout(0.3, name='dropout_post_activation')(x)
        output_tensor = Dense(num_classes_local, activation='softmax', name='classifier_output')(x)

        model = Model(inputs=input_layer, outputs=output_tensor, name=f"LSTM_{activation_name}_Classifier")
    return model

# --- Experiment Execution (adapted for LSTM) ---
def _select_distribution_strategy():
    """Return the tf.distribute strategy to build models under and print the device found.

    A TPU connection is attempted first. When that raises ValueError, the strategy reported
    by ``tf.distribute.get_strategy()`` is used instead.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu)
        # master() lives on the resolver object itself.
        print('Running on TPU:', tpu.master())
    except ValueError:
        print("TPU not found. Using default strategy (CPU/GPU).")
        strategy = tf.distribute.get_strategy()
        gpus = tf.config.list_physical_devices('GPU')
        if not gpus:
            print("Running on CPU.")
        elif strategy.num_replicas_in_sync > 1:
            print(f"Running on {strategy.num_replicas_in_sync} replica(s) using {type(strategy).__name__}.")
        elif len(gpus) == 1:
            print("Running on 1 GPU.")
        else:
            print(f"{len(gpus)} GPUs visible, but the active strategy runs a single replica.")
    print(f"Number of accelerators: {strategy.num_replicas_in_sync}")
    return strategy


def run_lstm_experiment(nlp_datasets_config, activations_config, lstm_optimizers_config,
                        num_runs, epochs_lstm_local, batch_size_lstm_local, patience_lstm_local,
                        max_len, vocab_s, embed_dim, lstm_u):
    """Train one LSTM classifier per (run, dataset, optimizer, activation) combination.

    The test split serves as validation data for early stopping and learning-rate
    reduction, and is evaluated again after training.

    Args:
        nlp_datasets_config: mapping dataset name -> config dict with a ``'task'`` entry.
        activations_config: mapping label -> activation spec understood by ``get_activation``.
        lstm_optimizers_config: mapping label -> zero-argument callable returning an optimizer.
        num_runs: number of repetitions; run ``i`` is seeded with ``RAND_SEED + i``.
        epochs_lstm_local: maximum epochs per training run.
        batch_size_lstm_local: batch size of the data pipelines.
        patience_lstm_local: early-stopping patience (LR reduction uses ``patience // 2 + 1``).
        max_len: padded sequence length.
        vocab_s: vocabulary size cap.
        embed_dim: embedding dimension.
        lstm_u: number of LSTM units.

    Returns:
        ``(results, histories)``. ``results[dataset][optimizer][activation]`` is a dict of
        per-run lists: ``'loss'`` / ``'accuracy'`` (last-epoch training values),
        ``'test_loss'`` / ``'test_accuracy'`` (post-training evaluation on the test split),
        ``'time'`` (seconds) and ``'params'``. ``histories`` has the same nesting and holds
        one Keras history dict per run.
    """
    results = {}
    histories = {}

    strategy = _select_distribution_strategy()

    for run_idx in range(num_runs):
        print(f"\n--- Starting LSTM Run {run_idx + 1}/{num_runs} ---")
        current_seed = RAND_SEED + run_idx
        set_seed(current_seed)

        for dataset_name, dataset_cfg in nlp_datasets_config.items():
            print(f"\nLSTM Dataset: {dataset_name} (Task: {dataset_cfg['task']})")
            if dataset_name not in results:
                results[dataset_name], histories[dataset_name] = {}, {}

            print(f"Loading and tokenizing {dataset_name} for LSTM run {run_idx + 1}")
            train_ds, test_ds, num_classes_ds, actual_vocab_s = load_and_prepare_nlp_data_lstm(
                dataset_name, vocab_s, max_len, batch_size_lstm_local, current_seed
            )

            loss_f, eval_m, primary_m_name = 'categorical_crossentropy', ['accuracy'], 'accuracy'
            test_pm_key = f'test_{primary_m_name}'

            for opt_name, opt_creator in lstm_optimizers_config.items():
                print(f"  Optimizer: {opt_name} (Run {run_idx + 1})")
                if opt_name not in results[dataset_name]:
                    results[dataset_name][opt_name], histories[dataset_name][opt_name] = {}, {}

                for act_name, act_cfg_val in activations_config.items():
                    print(f"    Activation: {act_name}")
                    if act_name not in results[dataset_name][opt_name]:
                        results[dataset_name][opt_name][act_name] = {
                            'loss': [], primary_m_name: [],
                            'test_loss': [], test_pm_key: [],
                            'time': [], 'params': [],
                        }
                        histories[dataset_name][opt_name][act_name] = []

                    # Fresh Keras state and identical seed for every model so that the
                    # activations are compared from the same starting point.
                    tf.keras.backend.clear_session()
                    gc.collect()
                    set_seed(current_seed)

                    with strategy.scope():
                        model = build_lstm_classifier_model(
                            actual_vocab_s, embed_dim, lstm_u, num_classes_ds,
                            act_cfg_val, max_len, strategy
                        )
                        opt_instance = opt_creator()
                        model.compile(optimizer=opt_instance, loss=loss_f, metrics=eval_m)

                    # Both callbacks watch accuracy on the test split (passed as validation data).
                    mon_metric = 'val_accuracy'
                    cbs = [
                        EarlyStopping(monitor=mon_metric, patience=patience_lstm_local, restore_best_weights=True, verbose=1),
                        ReduceLROnPlateau(monitor=mon_metric, factor=0.2, patience=patience_lstm_local // 2 + 1, min_lr=1e-8, verbose=1)
                    ]

                    start_t = time.time()
                    print(f"Starting LSTM training: {dataset_name}/{opt_name}/{act_name}")
                    hist = model.fit(
                        train_ds,
                        epochs=epochs_lstm_local,
                        validation_data=test_ds,
                        callbacks=cbs,
                        verbose=1
                    )
                    train_time = time.time() - start_t

                    print("RESULTS")
                    test_loss, test_accuracy = model.evaluate(test_ds, verbose=0, return_dict=False)
                    print(f"Test Loss: {test_loss:.4f}")
                    print(f"Test Accuracy: {test_accuracy:.4f}")

                    # Last-epoch training metrics; NaN when nothing was logged (e.g. zero epochs).
                    eval_l = (hist.history.get('loss') or [np.nan])[-1]
                    eval_pm = (hist.history.get('accuracy') or [np.nan])[-1]
                    n_params = model.count_params()

                    res_dict = results[dataset_name][opt_name][act_name]
                    res_dict['loss'].append(eval_l)
                    res_dict[primary_m_name].append(eval_pm)
                    # Keep the evaluation numbers too; they were previously only printed.
                    res_dict['test_loss'].append(test_loss)
                    res_dict[test_pm_key].append(test_accuracy)
                    res_dict['time'].append(train_time)
                    res_dict['params'].append(n_params)
                    histories[dataset_name][opt_name][act_name].append(hist.history)
                    print(f"      Training Loss: {eval_l:.4f}, Training {primary_m_name.capitalize()}: {eval_pm:.4f}, Time: {train_time:.2f}s, Params: {n_params}")

                    del model, opt_instance, hist
                    gc.collect()
            del train_ds, test_ds
            gc.collect()
    return results, histories

In [4]:
# Ask the Hugging Face hub client to hide its download progress bars.
# NOTE(review): this is set after `datasets` was imported and the recorded output still
# shows the bars, so the flag presumably has to be set before that import to take effect.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

# Datasets included in the benchmark (name -> per-dataset settings).
nlp_datasets_run_cfg_lstm = {
    "imdb": {"task": "classification"},
}
if not nlp_datasets_run_cfg_lstm:
    print("No NLP datasets configured for LSTM. Exiting.")
    # Raise rather than call exit(): inside a notebook kernel exit() is understood to only
    # schedule a shutdown, which would let the rest of this cell run with an empty config.
    raise SystemExit

# Activations under comparison: custom layers are passed as classes, built-ins by name.
activations_run_cfg_lstm = {
    'OptimA': OptimA,
    'OptimALinear': OptimALinear,
    'ReLU': 'relu',
    'Tanh': 'tanh',
    'Swish': 'swish',
    'GELU': 'gelu'
}

# Optimizer factories: each model gets its own optimizer instance from the callable.
lstm_optimizers_run_cfg = {
    'AdamW_LSTM': lambda: AdamW(learning_rate=LEARNING_RATE_LSTM, beta_1=0.95, beta_2=0.999, amsgrad=True)
}

print("--- LSTM NLP BENCHMARK ---")
print(f"Max Sequence Length: {MAX_LENGTH_LSTM}, Vocab Size: {VOCAB_SIZE_LSTM}")
print(f"Embedding Dim: {EMBEDDING_DIM_LSTM}, LSTM Units: {LSTM_UNITS_LSTM}")
print(f"Batch Size: {BATCH_SIZE_LSTM}, Epochs: {EPOCHS_LSTM}, Number of Runs: {NUM_RUNS_LSTM}")

# Run the benchmark; returns the per-combination metrics and the Keras training histories.
final_lstm_data, all_lstm_hists = run_lstm_experiment(
    nlp_datasets_run_cfg_lstm, activations_run_cfg_lstm, lstm_optimizers_run_cfg,
    num_runs=NUM_RUNS_LSTM, epochs_lstm_local=EPOCHS_LSTM, batch_size_lstm_local=BATCH_SIZE_LSTM,
    patience_lstm_local=PATIENCE_LSTM, max_len=MAX_LENGTH_LSTM, vocab_s=VOCAB_SIZE_LSTM,
    embed_dim=EMBEDDING_DIM_LSTM, lstm_u=LSTM_UNITS_LSTM
)

--- LSTM NLP BENCHMARK ---
Max Sequence Length: 150, Vocab Size: 20000
Embedding Dim: 128, LSTM Units: 128
Batch Size: 32, Epochs: 30, Number of Runs: 1
TPU not found. Using default strategy (CPU/GPU).
Running on 1 GPU.
Number of accelerators: 1

--- Starting LSTM Run 1/1 ---

LSTM Dataset: imdb (Task: classification)
Loading and tokenizing imdb for LSTM run 1
Loading dataset: imdb


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing texts for LSTM...


I0000 00:00:1747032543.520899      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Actual vocabulary size used: 20000
Number of classes for imdb: 2
  Optimizer: AdamW_LSTM (Run 1)
    Activation: OptimA




Starting LSTM training: imdb/AdamW_LSTM/OptimA
Epoch 1/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 243ms/step - accuracy: 0.5038 - loss: 0.7246 - val_accuracy: 0.5846 - val_loss: 0.6888 - learning_rate: 1.0000e-04
Epoch 2/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 243ms/step - accuracy: 0.5848 - loss: 0.6623 - val_accuracy: 0.7975 - val_loss: 0.4472 - learning_rate: 1.0000e-04
Epoch 3/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 243ms/step - accuracy: 0.8195 - loss: 0.4414 - val_accuracy: 0.8201 - val_loss: 0.4080 - learning_rate: 1.0000e-04
Epoch 4/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 242ms/step - accuracy: 0.8672 - loss: 0.3631 - val_accuracy: 0.8397 - val_loss: 0.3837 - learning_rate: 1.0000e-04
Epoch 5/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 244ms/step - accuracy: 0.8912 - loss: 0.3172 - val_accuracy: 0.8419 - val_loss: 0.3927 - learning_