- <https://jaykmody.com/blog/gpt-from-scratch/>
- <https://www.youtube.com/watch?v=kCc8FmEb1nY&list=WL&index=11&t=528s>

In [None]:
%pip install numpy

In [48]:
import numpy
from typing import List

In [59]:
# Fixed toy vocabulary: a word's position in this list is its token id.
# The empty string is the final entry (id 7).
VOCABULARY = ["not", "all", "the", "heroes", "wear", "capes", ".", ""]


class Tokenizer:
    """Convert between words and integer token ids using ``VOCABULARY``."""

    def get_index(self, word: str) -> int:
        """Return the token id of ``word``, i.e. its position in ``VOCABULARY``.

        Raises:
            ValueError: If ``word`` is not part of the vocabulary.
        """
        try:
            return VOCABULARY.index(word)
        except ValueError:
            # Same exception type as list.index, but with a message that
            # says which lookup table rejected the word.
            raise ValueError(f"{word!r} is not in the vocabulary") from None

    def encode(self, word_list: List[str]) -> List[int]:
        """Return the token id of every word in ``word_list``, in order.

        Raises:
            ValueError: If any word is not part of the vocabulary.
        """
        return [self.get_index(word) for word in word_list]

    def decode(self, tokens: List[int]) -> List[str]:
        """Return the vocabulary word for every token id in ``tokens``.

        Any iterable of integer ids is accepted, including a NumPy integer
        array.

        Raises:
            IndexError: If an id is not a valid index into ``VOCABULARY``.
        """
        return [VOCABULARY[token] for token in tokens]

In [60]:
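# The model returns a 2D array: output[i][j] is the score (probability) that
# vocabulary entry j is the next word after the input tokens up to position i.

# Illustration only: the vocabulary order and the probabilities below are an
# example and do not match VOCABULARY or the hard-coded model in this notebook.
#
#              ["all", "not", "heroes", "the", "wear", ".", "capes"]
# output[0] =  [0.75    0.1     0.0       0.15    0.0   0.0    0.0  ]
# given just "not", the model predicts the word "all" with the highest probability

#              ["all", "not", "heroes", "the", "wear", ".", "capes"]
# output[1] =  [0.0     0.0      0.8     0.1    0.0    0.0   0.1  ]
# given the sequence ["not", "all"], the model predicts the word "heroes" with the highest probability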

def gpt(input_tokens: List[int], vocab_size: int = 8) -> List[List[int]]:
    """Stand-in for a language model that always predicts the "next" token id.

    For every input token one row of ``vocab_size`` scores is produced. The
    row is one-hot: a 1 at the predicted next id and 0 everywhere else.
    Token ``t`` is followed by ``t + 1``; the last id, and any id outside
    the vocabulary, is followed by the last id.

    Args:
        input_tokens: Token ids to predict successors for.
        vocab_size: Number of entries per output row. The default of 8
            matches the 8-word vocabulary used in this notebook.

    Returns:
        One one-hot row per input token, in input order. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``vocab_size`` is smaller than 1.
    """
    if vocab_size < 1:
        raise ValueError("vocab_size must be at least 1")

    last_id = vocab_size - 1
    output: List[List[int]] = []

    for token in input_tokens:
        # Ids 0 .. last_id - 1 advance by one; everything else saturates at
        # the last id, so the sequence stays there once it is reached.
        next_id = token + 1 if 0 <= token < last_id else last_id

        row = [0] * vocab_size
        row[next_id] = 1
        output.append(row)

    return output

## Tokenization Phase

In [61]:
# Tokenizer that converts words into token ids and back, based on the
# available vocabulary (VOCABULARY).

tokenizer = Tokenizer()

# Encode the prompt; each word becomes its position in VOCABULARY.
input_tokens = tokenizer.encode(["not", "all"])

# Show the resulting ids.
print(input_tokens)

[0, 1]


## Decode prediction for next word in string

In [62]:
# Run the model on the prompt; gpt returns one row of scores per input token.
output = gpt(input_tokens)

# Get the token index of the most probable token coming after the last token in the input string
# (output[-1] is the row produced for the final input token).
next_token_id = numpy.argmax(output[-1])

## Guess a whole sentence

In [63]:

def generate(input_tokens: List[int], n_tokens_to_generate: int) -> numpy.ndarray:
    """Extend a prompt one token at a time with greedy auto-regressive decoding.

    On every step the whole sequence so far is passed to ``gpt``, the id with
    the highest score in the last output row is selected, and that id is
    appended to the sequence before the next step.

    Args:
        input_tokens: Token ids of the prompt. The argument is not modified.
        n_tokens_to_generate: How many ids to append to the prompt.

    Returns:
        A 1-D integer NumPy array with the prompt ids followed by the
        generated ids. An array is returned for every value of
        ``n_tokens_to_generate``, including 0.
    """
    # Work on a plain list: appending is cheap, whereas numpy.append copies
    # the whole array on every call.
    result_tokens = list(input_tokens)

    for _ in range(n_tokens_to_generate):  # auto-regressive decode loop
        output = gpt(result_tokens)

        # Greedy sampling: take the highest-scoring id of the last row.
        next_id = int(numpy.argmax(output[-1]))

        result_tokens.append(next_id)  # append prediction to input

    return numpy.asarray(result_tokens, dtype=int)

# Encode the two-word prompt and let the model append 9 more token ids.
input_ids = tokenizer.encode(["not", "all"])
output_tokens = generate(input_ids, 9)

# Token ids of the prompt followed by the generated ids.
print(output_tokens)

# Map every id back to its vocabulary word.
output_sentence = tokenizer.decode(output_tokens)

print(output_sentence)

[0 1 2 3 4 5 6 7 7 7 7]
['not', 'all', 'the', 'heroes', 'wear', 'capes', '.', '', '', '', '']
