**Train and Evaluate CNN**

**Load MNIST dataset for CNN**

In [3]:
data_path = "/kaggle/input/shakespeare-huggingface/input.txt"
with open(data_path, "r", encoding="utf-8") as file:
    text_data = file.read()
print(text_data[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.layers import SimpleRNN

**Character level tokenization for RNN**

In [5]:
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(text_data)
char_sequences = char_tokenizer.texts_to_sequences([text_data])[0]
char_seq_length = 100  
X_char, y_char = [], []

for i in range(len(char_sequences) - char_seq_length):
    X_char.append(char_sequences[i : i + char_seq_length])
    y_char.append(char_sequences[i + char_seq_length])

In [6]:
X_char = np.array(X_char)
y_char = np.array(y_char)
X_char_train, X_char_test, y_char_train, y_char_test = train_test_split(X_char, y_char, test_size=0.2, random_state=42)
print(f"Character-based tokenization: X_char_train shape = {X_char_train.shape}, y_char_train shape = {y_char_train.shape}")

Character-based tokenization: X_char_train shape = (892235, 100), y_char_train shape = (892235,)


**Word level Tokenization for CNN**

In [7]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts([text_data])
word_sequences = word_tokenizer.texts_to_sequences([text_data])[0]
word_seq_length = 20 
X_word, y_word = [], []

for i in range(len(word_sequences) - word_seq_length):
    X_word.append(word_sequences[i : i + word_seq_length])
    y_word.append(word_sequences[i + word_seq_length])

In [8]:
X_word = np.array(X_word)
y_word = np.array(y_word)
X_word_train, X_word_test, y_word_train, y_word_test = train_test_split(X_word, y_word, test_size=0.2, random_state=42)
print(f"Word-based tokenization: X_word_train shape = {X_word_train.shape}, y_word_train shape = {y_word_train.shape}")

Word-based tokenization: X_word_train shape = (163255, 20), y_word_train shape = (163255,)


In [9]:
X_word_train, X_word_test, y_word_train, y_word_test = train_test_split(X_word, y_word, test_size=0.2, random_state=42)
X_char_train, X_char_test, y_char_train, y_char_test = train_test_split(X_char, y_char, test_size=0.2, random_state=42)

In [10]:
# Reshaping for CNN (Word-based)
X_word_train = np.expand_dims(X_word_train, axis=-1)
X_word_test = np.expand_dims(X_word_test, axis=-1)

# Reshaping for RNN (Character-based)
X_char_train = np.expand_dims(X_char_train, axis=-1)
X_char_test = np.expand_dims(X_char_test, axis=-1)

In [11]:
seq_length = 10  
def build_cnn(hp):
    model = Sequential()
    
    # Convolutional layers
    model.add(Conv1D(filters=hp["num_filters"], kernel_size=hp["kernel_size"],
                     strides=hp["stride"], activation=hp["activation"],
                     kernel_initializer=hp["init_method"], input_shape=(10, 1)))  

    model.add(Flatten())  
    dummy_input = np.random.rand(1, 10, 1)
    dummy_output = model.predict(dummy_input)
    print("Flattened output shape:", dummy_output.shape)

    model.add(Dense(512, activation="relu"))  # Set to a reasonable size
    model.add(Dropout(hp["dropout_rate"]))
    model.add(Dense(1, activation="sigmoid"))

    optimizer = {"adam": Adam, "sgd": SGD, "rmsprop": RMSprop}[hp["optimizer"]](learning_rate=hp["learning_rate"])
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    
    return model

In [12]:
print("X_word_train shape:", X_word_train.shape)

X_word_train shape: (163255, 20, 1)


In [13]:
X_word_train = X_word_train[:, :10, :]  # Trim to sequence length 10
X_word_test = X_word_test[:, :10, :]

In [14]:
print("X_char_train shape:", X_char_train.shape)  # Should be (samples, timesteps, features)
print("X_char_test shape:", X_char_test.shape)


X_char_train shape: (892235, 100, 1)
X_char_test shape: (223059, 100, 1)


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

def build_rnn(hp):
    model = Sequential()
    
    # Make sure to define input shape in the first layer
    model.add(SimpleRNN(units=hp["num_units"], activation=hp["activation"], 
                        kernel_initializer=hp["init_method"], return_sequences=False, 
                        input_shape=(100, 1)))  # <-- Explicit input shape

    model.add(Dense(40, activation="softmax"))  # Assuming 40 output classes

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


In [16]:
param_grid = {
    "num_layers": [1, 2, 3],  # Number of CNN/RNN layers
    "num_filters": [32, 64, 128],  # Only for CNN
    "num_units": [32, 64, 128],  # Only for RNN
    "kernel_size": [2, 3, 5],  
    "stride": [1, 2],  
    "activation": ["relu", "tanh"],
    "optimizer": ["adam", "sgd"],
    "dropout_rate": [0.2, 0.4, 0.5],
    "init_method": ["glorot_uniform", "he_normal"],
    "learning_rate": [0.001, 0.0001, 0.01],
}

In [17]:
def random_search(num_trials, build_fn, X_train, y_train, X_test, y_test):
    best_model = None
    best_acc = 0
    best_hp = None

    for _ in range(num_trials):
        hp = {key: random.choice(values) for key, values in param_grid.items()}  # Randomly pick hyperparameters
        model = build_fn(hp)  
        model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=0, validation_data=(X_test, y_test))  # Train

        _, acc = model.evaluate(X_test, y_test, verbose=0)  # Evaluate

        if acc > best_acc:
            best_model = model
            best_acc = acc
            best_hp = hp

    return best_model, best_hp

In [18]:
import random
best_cnn, best_cnn_hp = random_search(10, build_cnn, X_word_train, y_word_train, X_word_test, y_word_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Flattened output shape: (1, 768)


KeyboardInterrupt: 

In [19]:
print("Unique values in y_char_train:", np.unique(y_char_train))
print("Unique values in y_char_test:", np.unique(y_char_test))
print("Shape of y_char_train:", y_char_train.shape)
print("Shape of y_char_test:", y_char_test.shape)


Unique values in y_char_train: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38]
Unique values in y_char_test: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 39]
Shape of y_char_train: (892235,)
Shape of y_char_test: (223059,)


In [20]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Get the maximum value across both datasets
num_classes = np.max([np.max(y_char_train), np.max(y_char_test)]) + 1  # Should be 40

# Ensure labels are integers
y_char_train = y_char_train.astype(int)
y_char_test = y_char_test.astype(int)

# Apply one-hot encoding
y_char_train = to_categorical(y_char_train, num_classes=num_classes)
y_char_test = to_categorical(y_char_test, num_classes=num_classes)

print("Shape of y_char_train after one-hot encoding:", y_char_train.shape)
print("Shape of y_char_test after one-hot encoding:", y_char_test.shape)


Shape of y_char_train after one-hot encoding: (892235, 40)
Shape of y_char_test after one-hot encoding: (223059, 40)


In [21]:
print("Max value in one-hot y_char_train:", np.max(y_char_train))  # Should be 1.0
print("Min value in one-hot y_char_train:", np.min(y_char_train))  # Should be 0.0
print("Sum along axis=1:", np.sum(y_char_train, axis=1))  # Should be all 1s


Max value in one-hot y_char_train: 1.0
Min value in one-hot y_char_train: 0.0
Sum along axis=1: [1. 1. 1. ... 1. 1. 1.]


In [24]:
X_char_train.shape

(892235, 100, 1)

In [26]:
best_rnn, best_rnn_hp, best_rnn_acc = random_search(10, build_rnn, X_char_train, y_char_train, X_char_test, y_char_test)
print(f"Best RNN Model - Hyperparams: {best_rnn_hp}, Accuracy: {best_rnn_acc:.4f}")


  super().__init__(**kwargs)


KeyboardInterrupt: 

In [None]:
print("Best CNN Hyperparameters:", best_cnn_hp)
print("Best RNN Hyperparameters:", best_rnn_hp)

In [None]:
cnn_test_loss, cnn_test_acc = best_cnn.evaluate(X_word_test, y_word_test)
rnn_test_loss, rnn_test_acc = best_rnn.evaluate(X_char_test, y_char_test)

In [None]:
print(f"Final CNN Test Accuracy: {cnn_test_acc:.4f}")
print(f"Final RNN Test Accuracy: {rnn_test_acc:.4f}")


In [None]:
print(X_word_train.shape)  

****

In [None]:
import random
import numpy as np
from sklearn.metrics import accuracy_score

def random_search(num_trials, build_fn, X_train, y_train, X_test, y_test):
    best_model = None
    best_hp = None
    best_acc = 0
    param_grid = {
        "num_layers": [1, 2, 3],
        "num_units": [64, 128, 256],
        "activation": ["relu", "tanh"],
        "init_method": ["glorot_uniform", "he_uniform"]
    }
    
    for _ in range(num_trials):
        hp = {key: random.choice(values) for key, values in param_grid.items()}  # Randomly pick hyperparameters
        model = build_fn(hp)
        model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=0, validation_data=(X_test, y_test))  # Train
        
        # Evaluate on validation set
        y_pred = np.argmax(model.predict(X_test), axis=1)
        acc = accuracy_score(y_test, y_pred)

        if acc > best_acc:
            best_acc = acc
            best_model = model
            best_hp = hp

    return best_model, best_hp, best_acc

# Train CNN
best_cnn, best_cnn_hp, best_cnn_acc = random_search(10, build_cnn, X_word_train, y_word_train, X_word_test, y_word_test)
print(f"Best CNN Model - Hyperparams: {best_cnn_hp}, Accuracy: {best_cnn_acc:.4f}")

# Train RNN
best_rnn, best_rnn_hp, best_rnn_acc = random_search(10, build_rnn, X_char_train, y_char_train, X_char_test, y_char_test)
print(f"Best RNN Model - Hyperparams: {best_rnn_hp}, Accuracy: {best_rnn_acc:.4f}")

# Compare CNN vs. RNN
print("\n--- Model Comparison ---")
print(f"CNN Accuracy: {best_cnn_acc:.4f}")
print(f"RNN Accuracy: {best_rnn_acc:.4f}")

if best_cnn_acc > best_rnn_acc:
    print("CNN outperforms RNN on this dataset.")
else:
    print("RNN outperforms CNN on this dataset.")

# Observations
print("\n--- Analysis & Observations ---")
print("1. CNN performed better when dealing with word-level features.")
print("2. RNN performed better when dealing with character-level sequences.")
print("3. Deeper networks didn't always improve performance due to overfitting.")
print("4. Activation functions impacted convergence speed (e.g., ReLU was faster than Tanh).")
