In [1]:
!pip install torch
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

## Install/Import Required Packages

In [2]:
!pip install torch torchvision torchaudio
!pip install datasets
!pip install scikit-learn
!pip install tqdm

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import random
import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from datasets import load_dataset




## 1. Load and Preprocess the Data

In [3]:
dataset = load_dataset("karpathy/tiny_shakespeare")
full_text = dataset["train"][0]["text"]  # entire Shakespeare text

# naive whitespace tokenization
tokens = full_text.strip().split()
vocab = set(tokens)
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(vocab)

indexed_tokens = [word2idx[w] for w in tokens]
print("Total tokens:", len(indexed_tokens))
print("Vocabulary size:", vocab_size)

# We originally had: 80% train, 20% test. Let's do that again,
# but also create a validation set (say 10% of the original train).
train_size = 0.8
train_cutoff = int(len(indexed_tokens) * train_size)
train_data_raw = indexed_tokens[:train_cutoff]
test_data = indexed_tokens[train_cutoff:]

# Now create a train/val split from train_data_raw (e.g. 90%/10%).
train_data, val_data = train_test_split(
    train_data_raw, test_size=0.1, shuffle=True, random_state=42
)

print("Train tokens:", len(train_data))
print("Val tokens:", len(val_data))
print("Test tokens:", len(test_data))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

tiny_shakespeare.py:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

The repository for karpathy/tiny_shakespeare contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/karpathy/tiny_shakespeare.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

Total tokens: 182499
Vocabulary size: 23841
Train tokens: 131399
Val tokens: 14600
Test tokens: 36500


## 2. Create PyTorch Datasets and Loaders
We'll make small utility Datasets for next-word prediction with a context of
seq_length=5 for demonstration.

In [4]:

class NextWordDataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_length=5):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        x = self.data[idx : idx + self.seq_length]
        y = self.data[idx + self.seq_length]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def make_dataloader(token_data, batch_size, seq_length=5, shuffle=True):
    ds = NextWordDataset(token_data, seq_length)
    loader = torch.utils.data.DataLoader(ds, batch_size=batch_size,
                                         shuffle=shuffle, drop_last=True)
    return loader

## 3. CNN Model for Next-Word Prediction
We'll implement a simple 1D CNN-based model that:
  - Has an embedding layer
  - Passes the embedding through 1 or more 1D convolution layers
  - Global average pool (or flatten) + FC layer for final logits

Hyperparameters:
  - embed_size
  - num_conv_layers
  - filters (list of filter counts)
  - kernel_size
  - stride
  - dropout
  - activation
  - weight_init

In [5]:
class CNNNextWord(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_size=64,
        num_conv_layers=2,
        filters=[32, 64],
        kernel_size=3,
        stride=1,
        dropout=0.2,
        activation='ReLU'
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.num_conv_layers = num_conv_layers
        self.filters = filters
        self.kernel_size = kernel_size
        self.stride = stride

        self.embedding = nn.Embedding(vocab_size, embed_size)

        act_map = {
            'ReLU': nn.ReLU(),
            'Tanh': nn.Tanh(),
            'Sigmoid': nn.Sigmoid()
        }
        self.activation = act_map[activation]
        padding_amount = kernel_size // 2  # simple "same" approximation for odd kernel

        conv_layers = []
        in_channels = embed_size  # we treat embed_size as the "channel"
        for i in range(num_conv_layers):
            out_channels = filters[i] if i < len(filters) else filters[-1]
            conv_layers.append(
                nn.Conv1d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding_amount
                )
            )
            conv_layers.append(self.activation)
            conv_layers.append(nn.Dropout(dropout))
            in_channels = out_channels

        self.conv_stack = nn.Sequential(*conv_layers)

        # We'll do a global average pooling across sequence dimension
        # then map to vocab for prediction
        # For a sequence length S, the conv output shape is (B, out_channels, S)
        # We'll average over S, leaving (B, out_channels)

        self.fc = nn.Linear(in_channels, vocab_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length)
        embed = self.embedding(x)  # (B, seq_length, embed_size)
        # We need to swap to (B, embed_size, seq_length) for Conv1D
        embed = embed.permute(0, 2, 1)
        conv_out = self.conv_stack(embed)  # (B, out_channels, seq_length)

        # global average pool across seq_length
        pooled = conv_out.mean(dim=2)      # (B, out_channels)

        logits = self.fc(pooled)           # (B, vocab_size)
        return logits

## 4. RNN Model for Next-Word Prediction

In [6]:
class RNNNextWord(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_size=64,
        hidden_size=128,
        num_layers=1,
        dropout=0.2,
        activation='Tanh'
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        act_map = {
            'Tanh': nn.Tanh(),
            'ReLU': nn.ReLU(),
            'Sigmoid': nn.Sigmoid()
        }
        self.activation_fn = act_map[activation]

        self.embedding = nn.Embedding(vocab_size, embed_size)

        # We'll create multiple "Vanilla RNN" layers
        # (stacked RNN style: output of one layer is the input to the next)
        self.rnn_layers = nn.ModuleList()
        for i in range(num_layers):
            input_dim = embed_size if i == 0 else hidden_size
            layer = nn.Linear(input_dim + hidden_size, hidden_size)
            self.rnn_layers.append(layer)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length)
        batch_size, seq_length = x.shape

        embed = self.embedding(x)  # (B, seq_length, embed_size)

        # We'll maintain hidden state for each layer
        # shape: (num_layers, B, hidden_size)
        hidden_states = [
            embed.new_zeros((batch_size, self.hidden_size))
            for _ in range(self.num_layers)
        ]

        # Unroll over seq_length
        for t in range(seq_length):
            inp_t = embed[:, t, :]  # (B, embed_size)
            # pass through stacked RNN layers
            prev_input = inp_t
            for i, layer in enumerate(self.rnn_layers):
                h_i = hidden_states[i]
                combined = torch.cat((prev_input, h_i), dim=1)  # (B, embed_size + hidden_size) or (hidden_size + hidden_size)
                h_next = self.activation_fn(layer(combined))
                h_next = self.dropout(h_next)
                hidden_states[i] = h_next
                prev_input = h_next

        # final hidden state from top layer
        h_last = hidden_states[-1]  # shape (B, hidden_size)

        logits = self.fc(h_last)    # shape (B, vocab_size)
        return logits

## 5. Weight Initialization
We can define a small utility function to apply different initialization
methods to all linear/conv layers in a model.

In [7]:
def apply_weight_init(model, init_method='xavier'):
    for module in model.modules():
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            if init_method.lower() == 'xavier':
                nn.init.xavier_normal_(module.weight)
            elif init_method.lower() == 'he':
                nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
            elif init_method.lower() == 'normal':
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
            else:
                pass  # default PyTorch init
            if module.bias is not None:
                nn.init.zeros_(module.bias)

## 6. Utility: Train/Evaluate Loops

hyperparams is a dict containing:

      - for both model_class: LR, batch_size, optimizer, activation, ...
      - for CNN: num_conv_layers, filters, kernel_size, stride, dropout, ...
      - for RNN: hidden_size, num_layers, dropout, ...
      - weight_init

In [8]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def compute_accuracy(logits, targets):
    _, preds = torch.max(logits, dim=1)
    correct = (preds == targets).sum().item()
    total = targets.size(0)
    return correct, total

def evaluate(model, data, batch_size=128, seq_length=5, criterion=None):
    loader = make_dataloader(data, batch_size, seq_length, shuffle=False)
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for x_batch, y_batch in loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch)
            if criterion is not None:
                loss = criterion(logits, y_batch)
                total_loss += loss.item() * x_batch.size(0)

            correct, total = compute_accuracy(logits, y_batch)
            total_correct += correct
            total_samples += total

    if criterion is None:
        avg_loss = None
    else:
        avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples

    return avg_loss, accuracy

def train_one_model(
    model_class,
    train_data,
    val_data,
    hyperparams,
    max_epochs=5,
    seq_length=5
):
   # 1) Build model instance
    model = model_class(**hyperparams["model_kwargs"]).to(device)

    # 2) Apply weight init
    apply_weight_init(model, init_method=hyperparams["weight_init"])

    # 3) Define optimizer
    if hyperparams["optimizer"].lower() == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=hyperparams["lr"])
    elif hyperparams["optimizer"].lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=hyperparams["lr"], momentum=0.9)
    elif hyperparams["optimizer"].lower() == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=hyperparams["lr"])
    else:
        optimizer = optim.Adam(model.parameters(), lr=hyperparams["lr"])

    criterion = nn.CrossEntropyLoss()

    # 4) Create train loader
    train_loader = make_dataloader(train_data,
                                   batch_size=hyperparams["batch_size"],
                                   seq_length=seq_length, shuffle=True)

    best_val_acc = 0.0
    best_model_state = None

    for epoch in range(max_epochs):
        model.train()
        epoch_loss = 0.0
        epoch_correct = 0
        epoch_total = 0

        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch)
            loss = criterion(logits, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() * x_batch.size(0)
            c, t = compute_accuracy(logits, y_batch)
            epoch_correct += c
            epoch_total += t

        train_acc = epoch_correct / epoch_total
        train_loss = epoch_loss / epoch_total

        # Evaluate on val
        val_loss, val_acc = evaluate(model, val_data,
                                     batch_size=hyperparams["batch_size"],
                                     seq_length=seq_length,
                                     criterion=criterion)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = model.state_dict()

        # Print or log
        print(f"Epoch [{epoch+1}/{max_epochs}] "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # final val performance
    final_val_loss, final_val_acc = evaluate(model, val_data,
                                             batch_size=hyperparams["batch_size"],
                                             seq_length=seq_length,
                                             criterion=criterion)

    return model, final_val_loss, final_val_acc

Using device: cuda


## 7. Random Search Setup
We define possible search spaces for each model class.

CNN hyperparams:
  - learning rate: [1e-3, 5e-3, 1e-4, ...]
  - batch size: [64, 128]
  - num_conv_layers: [1, 2, 3]
  - filters: possible combos, e.g. [ (32,64), (64,128), (32,64,128), ... ]
  - kernel_size: [3,5]
  - stride: [1,2]
  - activation: ['ReLU','Tanh','Sigmoid']
  - dropout: [0.0, 0.2, 0.5]
  - weight_init: ['xavier','he','normal']
  - optimizer: ['adam','sgd','rmsprop']

RNN hyperparams:
  - learning rate
  - batch size
  - hidden_size: [64,128,256]
  - num_layers: [1,2]
  - dropout
  - activation
  - weight_init
  - optimizer

We'll do a modest search size for demonstration (~ 8-10 random combos each).

In [9]:
import itertools

def sample_cnn_hparams():
    param_space = {
        "lr": [1e-3, 5e-4, 1e-4],
        "batch_size": [64, 128],
        "optimizer": ['adam','sgd','rmsprop'],
        "activation": ['ReLU','Tanh','Sigmoid'],
        "dropout": [0.0, 0.2, 0.5],
        "weight_init": ['xavier','he','normal'],

        "num_conv_layers": [1, 2, 3],
        "filters": [
            [32], [64], [32,64], [64,128], [32,64,128]
        ],
        "kernel_size": [3,5],
        "stride": [1,2],
    }
    # We'll do a random sample from these lists
    return {
        "lr": random.choice(param_space["lr"]),
        "batch_size": random.choice(param_space["batch_size"]),
        "optimizer": random.choice(param_space["optimizer"]),
        "activation": random.choice(param_space["activation"]),
        "dropout": random.choice(param_space["dropout"]),
        "weight_init": random.choice(param_space["weight_init"]),

        "num_conv_layers": random.choice(param_space["num_conv_layers"]),
        "filters": random.choice(param_space["filters"]),
        "kernel_size": random.choice(param_space["kernel_size"]),
        "stride": random.choice(param_space["stride"]),
    }

def sample_rnn_hparams():
    param_space = {
        "lr": [1e-3, 5e-4, 1e-4],
        "batch_size": [64, 128],
        "optimizer": ['adam','sgd','rmsprop'],
        "activation": ['Tanh','ReLU','Sigmoid'],
        "dropout": [0.0, 0.2, 0.5],
        "weight_init": ['xavier','he','normal'],

        "hidden_size": [64, 128, 256],
        "num_layers": [1, 2],
    }
    return {
        "lr": random.choice(param_space["lr"]),
        "batch_size": random.choice(param_space["batch_size"]),
        "optimizer": random.choice(param_space["optimizer"]),
        "activation": random.choice(param_space["activation"]),
        "dropout": random.choice(param_space["dropout"]),
        "weight_init": random.choice(param_space["weight_init"]),

        "hidden_size": random.choice(param_space["hidden_size"]),
        "num_layers": random.choice(param_space["num_layers"]),
    }

def build_cnn_model_kwargs(hparams):
    """
    Translate the random hparams dict into the constructor kwargs
    for CNNNextWord.
    """
    return {
        "vocab_size": vocab_size,
        "embed_size": 50,  # could be a separate param
        "num_conv_layers": hparams["num_conv_layers"],
        "filters": hparams["filters"][:hparams["num_conv_layers"]],
        "kernel_size": hparams["kernel_size"],
        "stride": hparams["stride"],
        "dropout": hparams["dropout"],
        "activation": hparams["activation"]
    }

def build_rnn_model_kwargs(hparams):
    """
    Translate the random hparams dict into constructor kwargs
    for RNNNextWord.
    """
    return {
        "vocab_size": vocab_size,
        "embed_size": 50,  # could also be a separate param
        "hidden_size": hparams["hidden_size"],
        "num_layers": hparams["num_layers"],
        "dropout": hparams["dropout"],
        "activation": hparams["activation"]
    }

## 8. Random Search Loop for CNN

In [10]:
def random_search_cnn(
    n_search=5,   # how many random combos
    max_epochs=5, # how many epochs per trial
    seq_length=5
):
    best_config = None
    best_val_acc = 0.0

    for i in range(n_search):
        hparams = sample_cnn_hparams()

        # Additional checks: we must ensure that
        # len(filters) >= num_conv_layers, or slice the array
        # (we do that in build_cnn_model_kwargs).

        model_kwargs = build_cnn_model_kwargs(hparams)
        trial_params = {
            "model_kwargs": model_kwargs,
            "lr": hparams["lr"],
            "batch_size": hparams["batch_size"],
            "optimizer": hparams["optimizer"],
            "weight_init": hparams["weight_init"],
        }

        print("\n=== CNN Trial {}/{} ===".format(i+1, n_search))
        print("Hyperparams:", hparams)

        model, val_loss, val_acc = train_one_model(
            model_class=CNNNextWord,
            train_data=train_data,
            val_data=val_data,
            hyperparams=trial_params,
            max_epochs=max_epochs,
            seq_length=seq_length
        )

        print(f"Validation Acc = {val_acc:.4f}, Loss = {val_loss:.4f}")
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_config = {
                "hyperparams": hparams,
                "model": model
            }
    return best_config, best_val_acc

## 9. Random Search Loop for RNN

In [11]:
def random_search_rnn(
    n_search=5,
    max_epochs=5,
    seq_length=5
):
    best_config = None
    best_val_acc = 0.0

    for i in range(n_search):
        hparams = sample_rnn_hparams()
        model_kwargs = build_rnn_model_kwargs(hparams)

        trial_params = {
            "model_kwargs": model_kwargs,
            "lr": hparams["lr"],
            "batch_size": hparams["batch_size"],
            "optimizer": hparams["optimizer"],
            "weight_init": hparams["weight_init"],
        }

        print("\n=== RNN Trial {}/{} ===".format(i+1, n_search))
        print("Hyperparams:", hparams)

        model, val_loss, val_acc = train_one_model(
            model_class=RNNNextWord,
            train_data=train_data,
            val_data=val_data,
            hyperparams=trial_params,
            max_epochs=max_epochs,
            seq_length=seq_length
        )

        print(f"Validation Acc = {val_acc:.4f}, Loss = {val_loss:.4f}")
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_config = {
                "hyperparams": hparams,
                "model": model
            }
    return best_config, best_val_acc


## 10. Execute Random Searches

In [12]:

N_SEARCH_CNN = 5
N_SEARCH_RNN = 5
EPOCHS_CNN = 5
EPOCHS_RNN = 5
SEQ_LENGTH = 5

print("\n===== Hyperparameter Search: CNN =====")
best_cnn_config, best_cnn_val_acc = random_search_cnn(
    n_search=N_SEARCH_CNN, max_epochs=EPOCHS_CNN, seq_length=SEQ_LENGTH
)

print("\nBest CNN val accuracy =", best_cnn_val_acc)
print("Best CNN config:", best_cnn_config["hyperparams"])

print("\n===== Hyperparameter Search: RNN =====")
best_rnn_config, best_rnn_val_acc = random_search_rnn(
    n_search=N_SEARCH_RNN, max_epochs=EPOCHS_RNN, seq_length=SEQ_LENGTH
)

print("\nBest RNN val accuracy =", best_rnn_val_acc)
print("Best RNN config:", best_rnn_config["hyperparams"])


===== Hyperparameter Search: CNN =====

=== CNN Trial 1/5 ===
Hyperparams: {'lr': 0.0005, 'batch_size': 64, 'optimizer': 'sgd', 'activation': 'ReLU', 'dropout': 0.5, 'weight_init': 'xavier', 'num_conv_layers': 1, 'filters': [32], 'kernel_size': 3, 'stride': 1}
Epoch [1/5] Train Loss: 9.9425, Train Acc: 0.0246 | Val Loss: 9.7804, Val Acc: 0.0289
Epoch [2/5] Train Loss: 9.5544, Train Acc: 0.0269 | Val Loss: 9.2497, Val Acc: 0.0289
Epoch [3/5] Train Loss: 9.0329, Train Acc: 0.0269 | Val Loss: 8.7923, Val Acc: 0.0289
Epoch [4/5] Train Loss: 8.6850, Train Acc: 0.0269 | Val Loss: 8.5056, Val Acc: 0.0289
Epoch [5/5] Train Loss: 8.4574, Train Acc: 0.0269 | Val Loss: 8.3195, Val Acc: 0.0289
Validation Acc = 0.0289, Loss = 8.3195

=== CNN Trial 2/5 ===
Hyperparams: {'lr': 0.0005, 'batch_size': 128, 'optimizer': 'sgd', 'activation': 'ReLU', 'dropout': 0.0, 'weight_init': 'xavier', 'num_conv_layers': 3, 'filters': [64], 'kernel_size': 5, 'stride': 2}
Epoch [1/5] Train Loss: 10.0643, Train Acc: 0.

## 11. Evaluate Best Models on Test Set


In [13]:
best_cnn_model = best_cnn_config["model"]
best_rnn_model = best_rnn_config["model"]

cnn_test_loss, cnn_test_acc = evaluate(
    best_cnn_model, test_data,
    batch_size=best_cnn_config["hyperparams"]["batch_size"],
    seq_length=SEQ_LENGTH,
    criterion=nn.CrossEntropyLoss()
)
rnn_test_loss, rnn_test_acc = evaluate(
    best_rnn_model, test_data,
    batch_size=best_rnn_config["hyperparams"]["batch_size"],
    seq_length=SEQ_LENGTH,
    criterion=nn.CrossEntropyLoss()
)

def perplexity_from_loss(loss):
    return math.exp(loss) if loss is not None else None

cnn_ppl = perplexity_from_loss(cnn_test_loss)
rnn_ppl = perplexity_from_loss(rnn_test_loss)

print("\n====== Final Test Results ======")
print(f"CNN -> Test Loss: {cnn_test_loss:.4f}, Test Acc: {cnn_test_acc:.4f}, Test PPL: {cnn_ppl:.2f}")
print(f"RNN -> Test Loss: {rnn_test_loss:.4f}, Test Acc: {rnn_test_acc:.4f}, Test PPL: {rnn_ppl:.2f}")


CNN -> Test Loss: 8.4642, Test Acc: 0.0272, Test PPL: 4741.87
RNN -> Test Loss: 8.5599, Test Acc: 0.0258, Test PPL: 5217.93
