<a href="https://colab.research.google.com/github/AbrahamKong/CMPE297-Transformers_and_Finetuning_with_LLMs/blob/main/nanogpt_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a GPT with TensorFlow

In [8]:

# Data Preprocessing

# 1. Download the dataset.
#    NOTE: the recorded output of this cell shows the header of Shakespeare's
#    "The Comedy of Errors", so this is a single play rather than the usual
#    "tiny shakespeare" corpus.
import urllib.request

url = "https://raw.githubusercontent.com/AbrahamKong/CMPE297-Transformers_and_Finetuning_with_LLMs/main/data/input.txt"
filename = "input.txt"
urllib.request.urlretrieve(url, filename)

# 2. Read the dataset into a string variable named 'text'.
#    The same `filename` used for the download is opened here so the two cannot
#    drift apart. The file is decoded as UTF-16.
with open(filename, 'r', encoding="utf16") as f:
    text = f.read()

# 3. Display the length of the dataset in characters
print("length of dataset in characters: ", len(text))

# 4. Inspect the first 1000 characters of the dataset
print("First 1000 characters of the dataset:")
print(text[:1000])

# 5. Identify and display all the unique characters in the dataset and determine the vocabulary size.
#    `sorted` already returns a list, so no intermediate list() is needed.
chars = sorted(set(text))
vocab_size = len(chars)
print("Unique characters in the dataset:")
print(''.join(chars))
print("Vocabulary Size:", vocab_size)


length of dataset in characters:  102434
First 1000 characters of the dataset:
< Shakespeare -- THE COMEDY OF ERRORS >
< from Online Library of Liberty (http://oll.libertyfund.org) >
< Unicode .txt version by Mike Scott (http://www.lexically.net) >
< from "The Complete Works of William Shakespeare" >
< ed. with a glossary by W.J. Craig M.A. >
< (London: Oxford University Press, 1916) >
<STAGE DIR>
<Scene.—Ephesus.>
</STAGE DIR>


<ACT 1>


<SCENE 1>
<A Hall in the Duke's Palace.>
<STAGE DIR>
<Enter Duke, Ægeon, Gaoler, Officers, and other Attendants.>
</STAGE DIR>
<ÆGEON>	<1%>
	Proceed, Solinus, to procure my fall,
	And by the doom of death end woes and all.
</ÆGEON>

<DUKE>	<1%>
	Merchant of Syracusa, plead no more.
	I am not partial to infringe our laws:
	The enmity and discord which of late
	Sprung from the rancorous outrage of your duke
	To merchants, our well-dealing countrymen,
	Who, wanting guilders to redeem their lives,
	Have seal'd his rigorous statutes with their bloods,
	Ex

In [9]:

# Build the two lookup tables used to encode/decode text:
#   idx2char maps an integer id to its character (ids follow the order of `chars`),
#   char2idx is the inverse mapping from character to integer id.
idx2char = dict(enumerate(chars))
char2idx = {symbol: position for position, symbol in idx2char.items()}


In [10]:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# BigramLanguageModel class definition in TensorFlow
class BigramLanguageModel(keras.Model):
    """Bigram language model: each token id directly looks up the logits for the next token.

    NOTE: a later cell in this notebook redefines a class with the same name,
    which shadows this definition once that cell has run.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size x vocab_size) embedding table that stores next-token logits."""
        super().__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        """Compute next-token logits and, optionally, the mean cross-entropy loss.

        Args:
            idx: integer token ids.
            targets: optional integer token ids; passed unchanged to the loss
                together with the logits, so its shape must match the leading
                dimensions of the logits.

        Returns:
            A tuple ``(logits, loss)``; ``loss`` is ``None`` when ``targets`` is ``None``.
        """
        logits = self.token_embedding_table(idx)
        if targets is not None:
            loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)
            loss = tf.reduce_mean(loss)
            return logits, loss
        return logits, None

    # Text generation method
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively, starting from ``idx``.

        Args:
            idx: integer token ids used as the starting context; only the last
                position of each row is used to predict the next token.
            max_new_tokens: number of tokens to sample per row.

        Returns:
            An int32 tensor holding only the newly sampled tokens, one column
            per generation step (the starting context is not included).
        """
        if max_new_tokens <= 0:
            # tf.concat rejects an empty list, so return an explicit empty result.
            return tf.zeros([tf.shape(idx)[0], 0], dtype=tf.int32)

        generated_tokens = []
        for _step in range(max_new_tokens):
            logits, _ = self(idx)
            # Keep the prediction for the last position only. Unlike squeezing
            # axis 1, this also works when the context is longer than one token.
            logits = logits[:, -1, :]
            # tf.random.categorical expects unnormalized log-probabilities, so the
            # raw logits are passed directly. Feeding softmax probabilities would
            # make it sample from softmax(probs), an almost uniform distribution.
            idx_next = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)
            generated_tokens.append(idx_next)
            # A bigram model only conditions on the most recent token.
            idx = idx_next
        return tf.concat(generated_tokens, axis=1)


In [11]:

# Corrected BigramLanguageModel class definition in TensorFlow
class BigramLanguageModel(keras.Model):
    """Bigram language model: each token id directly looks up the logits for the next token."""

    def __init__(self, vocab_size):
        """Create the (vocab_size x vocab_size) embedding table that stores next-token logits."""
        super().__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        """Compute next-token logits and, optionally, the mean cross-entropy loss.

        Args:
            idx: integer token ids, indexed as (batch, time).
            targets: optional integer token ids that are one time step shorter
                than ``idx`` (the next-token targets for every position of
                ``idx`` except the last).

        Returns:
            A tuple ``(logits, loss)``. When ``targets`` is given, the returned
            logits have their last time step removed so they line up with
            ``targets``. When ``targets`` is ``None``, the full logits are
            returned and ``loss`` is ``None``.
        """
        logits = self.token_embedding_table(idx)
        if targets is not None:
            # Ensure logits and targets have the same sequence length: the last
            # position has no target, so its prediction is dropped.
            logits = logits[:, :-1, :]
            loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)
            loss = tf.reduce_mean(loss)
            return logits, loss
        return logits, None

    # Text generation method
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively, starting from ``idx``.

        Args:
            idx: integer token ids used as the starting context; only the last
                position of each row is used to predict the next token.
            max_new_tokens: number of tokens to sample per row.

        Returns:
            An int32 tensor holding only the newly sampled tokens, one column
            per generation step (the starting context is not included).
        """
        if max_new_tokens <= 0:
            # tf.concat rejects an empty list, so return an explicit empty result.
            return tf.zeros([tf.shape(idx)[0], 0], dtype=tf.int32)

        generated_tokens = []
        for _step in range(max_new_tokens):
            logits, _ = self(idx)
            # Keep the prediction for the last position only. Unlike squeezing
            # axis 1, this also works when the context is longer than one token.
            logits = logits[:, -1, :]
            # tf.random.categorical expects unnormalized log-probabilities, so the
            # raw logits are passed directly. Feeding softmax probabilities would
            # make it sample from softmax(probs), an almost uniform distribution.
            idx_next = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)
            generated_tokens.append(idx_next)
            # A bigram model only conditions on the most recent token.
            idx = idx_next
        return tf.concat(generated_tokens, axis=1)


In [12]:

# Data Batching

# Function to generate batches of data for training
def get_batch(data, batch_size, block_size):
    """Yield consecutive (x, y) training pairs cut from ``data``.

    ``data`` is encoded with ``char2idx``, trimmed to a multiple of
    ``batch_size * block_size`` tokens, and laid out as ``batch_size`` rows.
    Each yielded ``x`` is a window of ``block_size`` columns; ``y`` is the same
    window shifted one column to the right and therefore holds
    ``block_size - 1`` columns.
    """
    # Encode every character as its integer id.
    token_ids = tf.constant([char2idx[symbol] for symbol in data], dtype=tf.int32)

    # Keep only as many tokens as fill whole (batch_size x block_size) chunks.
    tokens_per_chunk = batch_size * block_size
    usable_tokens = (len(token_ids) // tokens_per_chunk) * tokens_per_chunk
    grid = tf.reshape(token_ids[:usable_tokens], (batch_size, -1))

    # Walk across the columns one block at a time.
    row_length = grid.shape[1]
    for start in range(0, row_length, block_size):
        stop = start + block_size
        inputs = grid[:, start:stop]
        # Targets are the inputs shifted by one; the last input has no target.
        shifted = grid[:, start + 1:stop]
        yield inputs, shifted


In [13]:

# Training Setup

# Define the loss function
# NOTE: train_step below uses the loss returned by the model itself, so this
# object is not referenced by the training step in this cell.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define the optimizer
optimizer = tf.keras.optimizers.Adam()

# Training step function
@tf.function
def train_step(model, x, y):
    """Run one optimization step and return the loss for this batch.

    Args:
        model: callable as ``model(x, targets=y)`` returning ``(logits, loss)``
            and exposing ``trainable_variables``.
        x: batch of input token ids.
        y: batch of target token ids.

    Returns:
        The loss computed by the model for this batch.
    """
    # Only the forward pass needs to be recorded on the tape.
    with tf.GradientTape() as tape:
        _, loss = model(x, targets=y)
    # Differentiate and update after leaving the tape context.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Function to estimate average loss
def estimate_loss(model, data, batch_size, block_size):
    """Return the model's loss averaged over every batch produced from ``data``.

    Args:
        model: callable as ``model(x, targets=y)`` returning ``(logits, loss)``.
        data: text forwarded to ``get_batch``.
        batch_size: forwarded to ``get_batch``.
        block_size: forwarded to ``get_batch``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if ``get_batch`` yields no batches, since an average over
            zero batches is undefined.
    """
    total_loss = 0.0
    total_batches = 0
    for x, y in get_batch(data, batch_size, block_size):
        _, loss = model(x, targets=y)
        total_loss += loss
        total_batches += 1
    if total_batches == 0:
        # Fail with an actionable message instead of a bare ZeroDivisionError.
        raise ValueError(
            "estimate_loss received no batches: data is too short for "
            f"batch_size={batch_size} and block_size={block_size}"
        )
    return total_loss / total_batches


In [14]:

# Model instantiation
model = BigramLanguageModel(vocab_size=vocab_size)

# Training parameters
batch_size = 64
block_size = 128
epochs = 5
# Print the loss every `log_every` steps within an epoch. In the recorded run an
# epoch has fewer than 100 steps, so only step 0 of each epoch is logged.
log_every = 100

# Main Training Loop
for epoch in range(epochs):
    # Iterate over the batches of the dataset
    for step, (x, y) in enumerate(get_batch(text, batch_size, block_size)):
        # train_step always returns the batch loss, so no "loss missing" branch is needed.
        loss = train_step(model, x, y)

        if step % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs}, Step {step}, Loss: {loss.numpy()}")


Epoch 1/5, Step 0, Loss: 4.419447898864746
Epoch 2/5, Step 0, Loss: 4.400918483734131
Epoch 3/5, Step 0, Loss: 4.382547378540039
Epoch 4/5, Step 0, Loss: 4.364249229431152
Epoch 5/5, Step 0, Loss: 4.346034526824951
