# Part C

## Loading and Preparing the Dataset

In [36]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import get_file
import numpy as np
import math
from sklearn.model_selection import train_test_split

def load_ptb_dataset(reduce_factor=0.1):
    """Download the Penn Treebank text splits and return a leading slice of each.

    Each file is fetched with ``get_file``, every newline is replaced by an
    explicit ``<eos>`` token, and the text is split on whitespace into a flat
    list of tokens.

    Args:
        reduce_factor: Fraction of each split's tokens to keep, counted from the
            start of the file. Must satisfy ``0 < reduce_factor <= 1``.

    Returns:
        dict mapping ``'train'``, ``'valid'`` and ``'test'`` to a list of string
        tokens. Because the slice is taken on the flat token list, the last
        sentence of a split may be cut off before its ``<eos>``.

    Raises:
        ValueError: If ``reduce_factor`` is outside ``(0, 1]``.
    """
    if not 0 < reduce_factor <= 1:
        # A negative factor would silently slice from the end of the list.
        raise ValueError(f"reduce_factor must be in (0, 1], got {reduce_factor!r}")

    url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/"
    filenames = ["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt"]
    data = {}
    for name in filenames:
        file_path = get_file(name, url + name)
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read().replace('\n', ' <eos> ').split()
        # "ptb.train.txt" -> "train"
        split_name = name.split('.')[1]
        data[split_name] = text[:int(len(text) * reduce_factor)]
    return data

# Load data
data = load_ptb_dataset(reduce_factor=0.1)  # Use 10% of the data


def _split_into_sentences(tokens, eos_token='<eos>'):
    """Group a flat token list into sentences, each ending with ``eos_token``.

    A trailing run of tokens without a closing ``eos_token`` is kept as a final
    (partial) sentence.
    """
    sentences, current = [], []
    for token in tokens:
        current.append(token)
        if token == eos_token:
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences


# The Keras Tokenizer treats every element of the list it receives as one
# document. Feeding it the flat word list made each word its own document, so
# almost every resulting sequence had a single id and produced no
# (prefix, next-word) pair downstream. Grouping the tokens into sentences first
# gives one id sequence per sentence; entries that are lists of strings are
# taken as already-tokenized text.
sentences = {split: _split_into_sentences(tokens) for split, tokens in data.items()}

# Tokenize the data (vocabulary is built from the training split only)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences['train'])

# Convert text to sequences of integers (one sequence per sentence)
train_sequences = tokenizer.texts_to_sequences(sentences['train'])
valid_sequences = tokenizer.texts_to_sequences(sentences['valid'])
test_sequences = tokenizer.texts_to_sequences(sentences['test'])


## Preparing the Data for Model Training

In [37]:
def prepare_data(sequences, maxlen=30, num_classes=None):
    """Expand id sequences into (prefix, next-token) pairs for next-word prediction.

    For every sequence and every position ``i >= 1`` one sample is produced whose
    input is ``seq[:i]`` and whose target is ``seq[i]``. Sequences with fewer
    than two ids therefore contribute nothing.

    Args:
        sequences: Iterable of lists of integer token ids.
        maxlen: Length every input prefix is padded/truncated to by
            ``pad_sequences``.
        num_classes: Width of the one-hot target vectors. When ``None`` it is
            taken from the notebook-level ``tokenizer`` as
            ``len(tokenizer.word_index) + 1``.

    Returns:
        Tuple ``(X, y)``: ``X`` is the padded prefix array returned by
        ``pad_sequences`` and ``y`` the one-hot target array returned by
        ``to_categorical``.
    """
    if num_classes is None:
        num_classes = len(tokenizer.word_index) + 1

    X, y = [], []
    for seq in sequences:
        # range(1, len(seq)) is empty for sequences shorter than 2, so they are skipped.
        for i in range(1, len(seq)):
            X.append(seq[:i])
            y.append(seq[i])

    X = pad_sequences(X, maxlen=maxlen)
    # NOTE: the one-hot matrix has len(y) * num_classes entries and can be large.
    y = tf.keras.utils.to_categorical(y, num_classes=num_classes)
    # Both are already NumPy arrays; returning them directly avoids a second full copy.
    return X, y

# Prepare training, validation, and test sets
maxlen = 30
X_train, y_train = prepare_data(train_sequences, maxlen=maxlen)
X_valid, y_valid = prepare_data(valid_sequences, maxlen=maxlen)
X_test, y_test = prepare_data(test_sequences, maxlen=maxlen)

# Split training data for hyperparameter tuning.
# A fixed seed keeps the tuning split identical across kernel restarts, so
# validation losses from different runs are comparable.
SPLIT_SEED = 42
X_train_tune, X_valid_tune, y_train_tune, y_valid_tune = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)


## Model Configurations and Definitions

In [38]:
# Hyperparameters shared by every architecture defined below.
embedding_dim, hidden_units = 100, 128
batch_size, epochs = 64, 50
# One output class per tokenizer word index, plus one extra slot.
vocab_size = 1 + len(tokenizer.word_index)

# Define the RNN model
def create_rnn_model(vocab_size, embedding_dim, hidden_units):
    """Build an uncompiled Embedding -> SimpleRNN -> softmax next-word classifier.

    The embedding's input length is taken from the notebook-level ``maxlen``.
    """
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)
    recurrent = tf.keras.layers.SimpleRNN(hidden_units)
    classifier = tf.keras.layers.Dense(vocab_size, activation='softmax')
    return tf.keras.Sequential([embedding, recurrent, classifier])

# Define the LSTM model
def create_lstm_model(vocab_size, embedding_dim, hidden_units):
    """Build an uncompiled Embedding -> LSTM -> softmax next-word classifier.

    The embedding's input length is taken from the notebook-level ``maxlen``.
    """
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)
    recurrent = tf.keras.layers.LSTM(hidden_units)
    classifier = tf.keras.layers.Dense(vocab_size, activation='softmax')
    return tf.keras.Sequential([embedding, recurrent, classifier])

# Define the GRU model
def create_gru_model(vocab_size, embedding_dim, hidden_units):
    """Build an uncompiled Embedding -> GRU -> softmax next-word classifier.

    The embedding's input length is taken from the notebook-level ``maxlen``.
    """
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)
    recurrent = tf.keras.layers.GRU(hidden_units)
    classifier = tf.keras.layers.Dense(vocab_size, activation='softmax')
    return tf.keras.Sequential([embedding, recurrent, classifier])


## Transformer Model Definition

In [39]:
# Define the Transformer model
def create_transformer_model(vocab_size, embedding_dim, hidden_units, num_heads=4, num_layers=2, dropout_rate=0.1):
    """Build an uncompiled stack of self-attention encoder blocks for next-word prediction.

    This replaces two same-named definitions; the earlier single-attention
    variant was always shadowed by the later one and has been removed.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        embedding_dim: Token embedding width; also the width restored by the
            second feed-forward layer so the residual additions line up.
        hidden_units: Used both as the per-head ``key_dim`` of the attention
            layers and as the width of the inner feed-forward layer.
        num_heads: Number of attention heads per block.
        num_layers: Number of encoder blocks stacked.
        dropout_rate: Dropout applied after the attention and feed-forward
            sub-layers.

    Returns:
        A ``tf.keras.Model`` mapping ``(batch, maxlen)`` token ids to a softmax
        distribution over ``vocab_size`` classes. ``maxlen`` is read from the
        notebook-level variable.
    """
    # NOTE(review): no positional information is added to the embeddings and the
    # sequence is mean-pooled, so the model cannot see token order; padded
    # positions are also attended to and averaged. Confirm this is intended.
    inputs = tf.keras.Input(shape=(maxlen,))
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)

    for _ in range(num_layers):
        # Self-attention sub-layer followed by residual connection + layer norm.
        attn_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(x, x)
        attn_output = tf.keras.layers.Dropout(dropout_rate)(attn_output)
        out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

        # Position-wise feed-forward sub-layer, projected back to embedding_dim.
        ffn_output = tf.keras.Sequential([
            tf.keras.layers.Dense(hidden_units, activation='relu'),
            tf.keras.layers.Dense(embedding_dim)
        ])(out1)
        ffn_output = tf.keras.layers.Dropout(dropout_rate)(ffn_output)
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    # Collapse the sequence axis to one vector per sample before classifying.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
    model = tf.keras.Model(inputs, outputs)
    return model


## Training and Evaluation Utilities


In [40]:
# Learning rate scheduler
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a linear warmup followed by inverse-sqrt decay.

    ``lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    Args:
        d_model: Scaling dimension; the rate is multiplied by ``d_model ** -0.5``.
        warmup_steps: Step at which the two terms of the ``min`` cross over.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the value as passed for get_config(); the tensor form is used in __call__.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Optimizers may pass an integer step counter; rsqrt needs a float tensor.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

# Compile and train models with hyperparameter tuning
def tune_and_train_model(model_fn, vocab_size, embedding_dim, hidden_units, X_train, y_train, X_valid, y_valid, additional_args=None, learning_rates=(0.001, 0.0005, 0.0001)):
    """Train one model per candidate learning rate and return the best one.

    For each learning rate a fresh model is built with ``model_fn``, compiled
    with Adam and categorical cross-entropy, trained with early stopping on
    ``val_loss``, and scored on the validation data. The model with the lowest
    validation loss (equivalently, the lowest perplexity ``exp(loss)``) wins.

    Args:
        model_fn: Callable ``(vocab_size, embedding_dim, hidden_units, **kwargs)``
            returning an uncompiled Keras model.
        vocab_size, embedding_dim, hidden_units: Forwarded to ``model_fn``.
        X_train, y_train: Training inputs and one-hot targets.
        X_valid, y_valid: Validation inputs and one-hot targets.
        additional_args: Optional dict of extra keyword arguments for
            ``model_fn``.
        learning_rates: Candidate Adam learning rates to try.

    Returns:
        The trained model with the lowest validation loss, or ``None`` if no
        candidate produced a loss that compares below infinity (for example
        when every run ends with a NaN loss).

    Training length and batch size come from the notebook-level ``epochs`` and
    ``batch_size`` variables.
    """
    model_kwargs = dict(additional_args or {})
    best_model = None
    best_loss = float('inf')

    for lr in learning_rates:
        model = model_fn(vocab_size, embedding_dim, hidden_units, **model_kwargs)
        # Use the candidate rate itself; previously the loop variable was ignored
        # and every iteration trained with the same schedule.
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Restore the best epoch's weights so the score below is not taken from
        # the (worse) final epochs that triggered the stop.
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=5, restore_best_weights=True
        )
        model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])

        loss, _ = model.evaluate(X_valid, y_valid)

        # exp() is monotonic, so ranking by loss equals ranking by perplexity
        # and cannot overflow on a very large loss.
        if loss < best_loss:
            best_loss = loss
            best_model = model

    return best_model

# Evaluate models and calculate perplexity
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on held-out data and report loss, accuracy and perplexity.

    ``model.evaluate`` is expected to return a ``(loss, accuracy)`` pair.
    Perplexity is computed as ``exp(loss)``. Both results are printed.

    Returns:
        Tuple ``(loss, accuracy, perplexity)``.
    """
    metrics = model.evaluate(X_test, y_test)
    loss, accuracy = metrics
    print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

    # Perplexity is the exponential of the mean cross-entropy loss.
    perplexity = math.exp(loss)
    print(f'Perplexity: {perplexity}')

    return loss, accuracy, perplexity


## Model Training and Evaluation

In [41]:
# RNN: tune on the held-out slice of the training data, then score on the test set
print("RNN Model Evaluation")
best_rnn_model = tune_and_train_model(
    create_rnn_model,
    vocab_size,
    embedding_dim,
    hidden_units,
    X_train=X_train_tune,
    y_train=y_train_tune,
    X_valid=X_valid_tune,
    y_valid=y_valid_tune,
)
rnn_loss, rnn_accuracy, rnn_perplexity = evaluate_model(
    best_rnn_model, X_test=X_test, y_test=y_test
)

RNN Model Evaluation
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
E

In [45]:
# LSTM: tune on the held-out slice of the training data, then score on the test set
print("LSTM Model Evaluation")
best_lstm_model = tune_and_train_model(
    create_lstm_model,
    vocab_size,
    embedding_dim,
    hidden_units,
    X_train=X_train_tune,
    y_train=y_train_tune,
    X_valid=X_valid_tune,
    y_valid=y_valid_tune,
)
lstm_loss, lstm_accuracy, lstm_perplexity = evaluate_model(
    best_lstm_model, X_test=X_test, y_test=y_test
)

LSTM Model Evaluation
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


In [51]:
# GRU: tune on the held-out slice of the training data, then score on the test set
print("GRU Model Evaluation")
best_gru_model = tune_and_train_model(
    create_gru_model,
    vocab_size,
    embedding_dim,
    hidden_units,
    X_train=X_train_tune,
    y_train=y_train_tune,
    X_valid=X_valid_tune,
    y_valid=y_valid_tune,
)
gru_loss, gru_accuracy, gru_perplexity = evaluate_model(
    best_gru_model, X_test=X_test, y_test=y_test
)

GRU Model Evaluation
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
E

In [44]:
# Transformer: tune with 8 heads and 4 layers, then score on the test set
print("Transformer Model Evaluation")
best_transformer_model = tune_and_train_model(
    create_transformer_model, vocab_size, embedding_dim, hidden_units,
    X_train=X_train_tune, y_train=y_train_tune,
    X_valid=X_valid_tune, y_valid=y_valid_tune,
    additional_args=dict(num_heads=8, num_layers=4),
)
transformer_loss, transformer_accuracy, transformer_perplexity = evaluate_model(
    best_transformer_model, X_test=X_test, y_test=y_test
)


Transformer Model Evaluation
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch