### Get the training data

In [None]:
# Load the Tiny Shakespeare training text from a local file.
import numpy as np
from typing import List, Any

# Pin the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, so the same file could decode differently on another machine.
with open("./training-data.txt", 'r', encoding="utf-8") as training_data_file:
  training_data = training_data_file.read()

# Preview only the first 4000 characters rather than printing the whole corpus.
print(f"""Training data from the Tiny Shakespeare dataset:\n\n
{training_data[:4000]}""")

Training data from the Tiny Shakespeare dataset):


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak th

In [2]:
# Report the corpus size in characters.
print(f"training data length: {len(training_data)}")

# Build the vocabulary. Following Andrej Karpathy's approach, the model is a
# next-*character* predictor, so the vocabulary is the set of distinct
# characters in the text rather than the set of distinct words.
# sorted() accepts any iterable and returns a new list, so the set is passed
# to it directly.
unique_characters = sorted(set(training_data))
print(len(unique_characters))
print("".join(unique_characters))

training data length: 1115393
65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


### Create our Tokenizer

In [3]:
# Character-level tokenizer, similar to the one in the Sebastian Raschka video:
# build a token -> token-id lookup table and its inverse. As in Andrej
# Karpathy's setup, a token here is a single character, not a word.

# Each character's id is its position in the sorted vocabulary.
char_to_id = {token: token_id for token_id, token in enumerate(unique_characters)}
# The reverse table is obtained by flipping every (character, id) pair.
id_to_char = {token_id: token for token, token_id in char_to_id.items()}
print(char_to_id)
print(id_to_char)

def encode_char_to_id(string : str) -> List[int]:
  """Translate text into token ids, one id per character.

  Every character is looked up in the module-level ``char_to_id`` table, so a
  character that is missing from that table raises ``KeyError``.
  """
  token_ids = []
  for char in string:
    token_ids.append(char_to_id[char])
  return token_ids

def decode_id_to_char(list_of_chars : List[int]) -> str:
  """Turn a sequence of token ids back into the text it encodes.

  Despite its name, ``list_of_chars`` holds token ids. Each id is looked up in
  the module-level ``id_to_char`` table and the resulting characters are
  concatenated; an id that is missing from the table raises ``KeyError``.
  """
  characters = (id_to_char[integer_id] for integer_id in list_of_chars)
  return ''.join(characters)
#print(unique_characters)

# Encoding and decoding are meant to undo each other, so sending the
# vocabulary through two encode/decode round trips should print it back
# unchanged.
first_round_trip = decode_id_to_char(encode_char_to_id(unique_characters))
print(decode_id_to_char(encode_char_to_id(first_round_trip)))

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

### Tokenize each character of the Tiny Shakespeare dataset

In [4]:
import torch

# Tokenize the entire corpus: one integer id per character, stored as a
# PyTorch tensor of 64-bit integers (torch.int64 is the same dtype as torch.long).
text_as_a_tensor = torch.tensor(
  encode_char_to_id(training_data),
  dtype=torch.int64,
)

# To inspect the result, convert the tensor to a NumPy array and decode it:
#   print(decode_id_to_char(text_as_a_tensor.numpy()))

### Create train, validation, and test splits. Set up your code environment to integrate a data loader into the process.

In [77]:
# Split the token tensor in order: the first 80% is used for training, the
# next 10% for validation, and whatever remains for testing.
text_len = len(text_as_a_tensor)
train_end = int(text_len * 0.8)
val_end = int(text_len * 0.9)
train_split = text_as_a_tensor[:train_end]
val_split = text_as_a_tensor[train_end:val_end]
test_split = text_as_a_tensor[val_end:]

# context_length is the maximum number of token ids taken into consideration
# when predicting the next output. With sub-word tokenization that would be up
# to 16 sub-words; with character-level tokenization it is up to 16
# characters. Whenever the transformer is given more than context_length ids,
# the input has to be truncated to that limit.
context_length = 16
# Number of blocks sampled per batch.
batch_size = 8

for split in (train_split, val_split, test_split):
  print(split.shape)

torch.Size([892314])
torch.Size([111539])
torch.Size([111540])


### Create a data loader that randomly samples a batch of contiguous character blocks. Each block is `context_length` characters long, and each batch contains `batch_size` blocks.

In [78]:
def get_batch(dataset, context_length, batch_size):
  """Sample a random batch of contiguous blocks and their next-token labels.

  Args:
    dataset: token ids to sample from; assumed to be a 1D tensor for now.
    context_length: number of consecutive tokens in every sampled block.
    batch_size: number of blocks to sample.

  Returns:
    A pair ``(inputs, labels)`` of stacked blocks, one row per sampled block.
    Row ``i`` of ``labels`` is row ``i`` of ``inputs`` shifted one position to
    the right in ``dataset``.
  """
  # randint's upper bound is exclusive, so the largest possible start is
  # len(dataset) - context_length - 1. That leaves room for a full-length
  # input block *and* for the label block, which ends one position later.
  num_start_positions = len(dataset) - context_length
  starts = torch.randint(num_start_positions, (batch_size,)).tolist()

  input_blocks = []
  label_blocks = []
  for start in starts:
    stop = start + context_length
    input_blocks.append(dataset[start:stop])
    label_blocks.append(dataset[start + 1:stop + 1])
  return torch.stack(input_blocks, dim=0), torch.stack(label_blocks, dim=0)

# Draw one batch from the training split. No random seed is set here, so the
# sampled blocks differ from run to run.
inputs, labels = get_batch(
  dataset=train_split,
  context_length=context_length,
  batch_size=batch_size,
)

def visualize_input_label_intuition(inputs, labels, batch_size, context_length):
  """Print every (context, next-token) pair contained in one batch.

  For each of the first ``batch_size`` rows and each of the first
  ``context_length`` timesteps, the context is the row of ``inputs`` up to and
  including that timestep, and the label is the entry of ``labels`` at that
  timestep.

  Args:
    inputs: 2D array-like of token ids, indexed as ``inputs[row, :t + 1]``.
    labels: 2D array-like of token ids, indexed as ``labels[row, t]``.
    batch_size: number of rows to walk through.
    context_length: number of timesteps to walk through in every row.

  Returns:
    None. The pairs are written to standard output, one line per pair.
  """
  for batch_idx in range(batch_size):
    for timestep in range(context_length):
      context = inputs[batch_idx, :timestep + 1]
      target = labels[batch_idx, timestep]
      # Previously this line was commented out, which made the whole function
      # a no-op. tolist()/item() give plain Python values for printing.
      print(f"When the inputs are {context.tolist()}, the label is {target.item()}")

# Optional walkthrough of the sampled batch (left disabled):
#visualize_input_label_intuition(inputs, labels, batch_size, context_length)

# Show each sampled input block as text, one block per print call.
for block in inputs:
  print(decode_id_to_char(block.tolist()))


handsome stripli


SICINIUS:
This

'Tis torture, a
are men's ends m
MILLO:
Be advise
not now have the
t thought of wha
ir: I have
a kin


### Implement the simplest language model to use with our data: the bigram language model.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
  """Bigram language model built around a single embedding lookup table."""

  def __init__(self, vocab_size):
    """Create the lookup table used to score the next token.

    Args:
      vocab_size: total number of tokens (in this case, characters) that the
        tokenizer could possibly see.
    """
    # super has to be *called* to obtain the proxy object. The previous
    # `super.__init__()` raised TypeError, so nn.Module was never initialised
    # and the model could not be constructed.
    super().__init__()

    # A bigram model looks only at the current token to find the most likely
    # next token, so the table is vocab_size x vocab_size: one row per current
    # token and, as intended here, one column per candidate next token.
    #
    # Generalising to an n-gram model (n >= 2): the context is the previous
    # n - 1 tokens, so the table would need vocab_size ** (n - 1) rows (for a
    # trigram, vocab_size * vocab_size) and still vocab_size columns. A context
    # k_(n-2), ..., k_1, k_0 can be flattened into a single row index by
    # reading the token ids as the digits of a base-vocab_size number, i.e.
    # the sum over i of k_i * vocab_size ** i.
    self.next_token_embedding_table_ = nn.Embedding(vocab_size, vocab_size)