In [None]:
# Load the training data text from the local file ./training-data.txt
import numpy as np
from typing import List, Any
# Read the whole corpus into memory as a single string. The encoding is passed
# explicitly so the decoded text does not depend on the platform's default locale.
with open("./training-data.txt", "r", encoding="utf-8") as training_data_file:
  training_data = training_data_file.read()

# Preview the first 4000 characters as a sanity check on what was loaded.
print(f"""Training data from the Tiny Shakespeare dataset:\n\n
{training_data[:4000]}""")

In [29]:
print(f"training data length: {len(training_data)}")

## Build the character vocabulary
# Following Andrej Karpathy, the goal is a next-character predictor, so the
# vocabulary is made of the distinct characters in the corpus, not the words.

# set(...) drops duplicates; sorted(...) accepts any iterable and returns a new
# list, so no intermediate list(...) conversion is required.
unique_characters = sorted(set(training_data))
print(len(unique_characters))
print("".join(unique_characters))

training data length: 1115393
65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Similar to the Sebastian Raschka video:
# Create a mapping between tokens and token ids, and vice versa
# - although, again, in Andrej Karpathy's case, our tokens are characters, not words

# Simple character-level tokenizer: two lookup tables that mirror each other.
# enumerate hands out ids 0, 1, 2, ... following the order of unique_characters.
char_to_id = {char: integer_id for integer_id, char in enumerate(unique_characters)}
# Invert the first table so ids can be mapped back to characters.
id_to_char = {integer_id: char for char, integer_id in char_to_id.items()}
print(char_to_id)
print(id_to_char)

def encode_char_to_id(string : str) -> List[int]:
  """Translate each character of `string` into its integer id.

  Ids are looked up in the module-level `char_to_id` table, one per
  character, and returned in the same order as the input. A character
  that is missing from the table raises KeyError.
  """
  token_ids = []
  for character in string:
    token_ids.append(char_to_id[character])
  return token_ids

def decode_id_to_char(list_of_chars : List[int]) -> str:
  """Turn a sequence of integer ids back into a single string.

  Despite its name, `list_of_chars` holds integer ids, not characters.
  Each id is looked up in the module-level `id_to_char` table and the
  resulting characters are concatenated in order. An id that is missing
  from the table raises KeyError.
  """
  characters = []
  for token_id in list_of_chars:
    characters.append(id_to_char[token_id])
  return ''.join(characters)
#print(unique_characters)

# Round-trip demonstration: encode the vocabulary to ids, decode it back to
# text, then repeat once more. Because the two functions undo each other, the
# printed result is expected to be the vocabulary joined into one string.
print(
  decode_id_to_char(
    encode_char_to_id(
      decode_id_to_char(encode_char_to_id(unique_characters))
    )
  )
)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',