# Heres how you can build an LLM from scratch

1. we will be using a open source text book for this as a training set

The text book is named: America - A history (by America: Author: Robert Mackenzie)

IN short here's what we would be doing:

1. load the book.
2. split its contents into individual tokens (punchuations or not depends - but in this we wont consider it for the sake of simplicity)
3. then those token and convert it to token ids
4. now we will convert those token ids to token embeddings.
5. then these embedding would be passed onto the TRANSFORMER LAYER
6. then the postprocessing step comes in where we decode the output generated by the llm to convert it into human readable form.

In [22]:
# here we print out first 200 characters of the book that we are going to be using
import re
book = open("america.txt", 'r').read()

preprocessed = re.split(r'([,.?_!"()\']|--|\s)', book)
print(len(preprocessed))
print(preprocessed[:40])

393235
['The', ' ', 'Project', ' ', 'Gutenberg', ' ', 'eBook', ' ', 'of', ' ', 'America:', ' ', 'A', ' ', 'history', '\n', '', ' ', '', ' ', '', ' ', '', ' ', '', '\n', 'This', ' ', 'ebook', ' ', 'is', ' ', 'for', ' ', 'the', ' ', 'use', ' ', 'of', ' ']


In [23]:
all_words = sorted(list(preprocessed))
vocabSize = len(all_words)
print(vocabSize)

393235


In [27]:
# create a vocab - assign unique int ids to each individual token
vocab = { token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 950:
        break


('', 28837)
('\n', 46661)
(' ', 205333)
('!', 205365)
('#53314]', 205366)
('&c', 205367)
('(', 205381)
(')', 205395)
('*', 205730)
('***', 205734)
(',', 214948)
('--', 215453)
('.', 225646)
('000', 225680)
('000;', 225682)
('1', 225686)
('10', 225688)
('100', 225692)
('101', 225697)
('102', 225699)
('103', 225701)
('103;', 225702)
('104', 225703)
('105', 225709)
('106', 225711)
('107', 225718)
('108', 225723)
('109', 225726)
('11', 225729)
('110', 225734)
('111', 225737)
('112', 225739)
('112;', 225740)
('113', 225741)
('114', 225742)
('115', 225748)
('116', 225750)
('117', 225754)
('118', 225756)
('119', 225757)
('11th', 225758)
('12', 225761)
('120', 225763)
('120;', 225764)
('121', 225777)
('122', 225779)
('123', 225783)
('124', 225785)
('125', 225786)
('127', 225793)
('127;', 225794)
('128', 225796)
('129', 225798)
('12th', 225801)
('12½d', 225802)
('13', 225803)
('130', 225807)
('131', 225809)
('132', 225812)
('133', 225815)
('134', 225818)
('136', 225821)
('137', 225823)
('138', 

In [None]:

# make a tokenizer

class Tokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab #this variable gets the value of the vocab we created
        self.int_to_str = {i:s for s,i in vocab.items()} # this maps the numbers to string

    def encode(self, text): 
        """this as said will take string as inp and return int mapping for it"""
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids): 
        """this does exact opposite of encode -> it converts number back to its string mapping"""
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!()\'])', r'\1', text)
        return text