In [None]:
import torch
import tiktoken
import re

In [None]:
# Load the raw corpus. The encoding is pinned so decoding does not depend on
# the platform's locale default (assumes verdict.txt is UTF-8 -- TODO confirm).
with open('verdict.txt', encoding='utf-8') as fd:
    raw_txt = fd.read()

In [None]:
len(raw_txt)

In [None]:
# Split on single whitespace characters; the capturing group keeps each
# separator as its own item in the result.
result = re.split(r'(\s)',raw_txt)
# Only the last expression of a cell is displayed, so the former bare
# `result` line had no effect and was dropped.
len(result)

In [None]:
# Split on commas, periods and whitespace; the capturing group keeps each
# separator as its own item in the result.
result = re.split(r'([,.]|\s)',raw_txt)
# Only the last expression of a cell is displayed, so the former bare
# `result` line had no effect and was dropped.
len(result)

In [None]:
# Tokenise the corpus: split on punctuation, `--` and whitespace, strip every
# piece, and keep only the pieces that are non-empty after stripping.
preprocess = [tok for tok in map(str.strip, re.split(r'([,.:;?_!()"\']|--|\s)',raw_txt)) if tok]

In [None]:
print(len(preprocess))


In [None]:
# Unique tokens in sorted order. Iteration order of a set of strings can
# change between kernel restarts, which would assign different ids to the
# same token on every run; sorting makes the vocabulary reproducible.
all_words = sorted(set(preprocess))
vocab_size = len(all_words)
vocab_size

In [None]:
# Append the special tokens only when they are missing, so re-running this
# cell does not add duplicate entries (duplicates would make len(all_words)
# larger than the vocabulary dict built from it).
all_words.extend(tok for tok in ('<|unk|>','<|endoftext|>') if tok not in all_words)

In [None]:
all_words

In [None]:
# Map every token to its position in all_words (token -> integer id).
vocab = dict(zip(all_words, range(len(all_words))))

In [None]:
# Show both special-token ids; as two separate statements only the last
# lookup would be displayed.
vocab['<|unk|>'], vocab['<|endoftext|>']

In [None]:
class SimpleTokenizer_v1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, ``--`` and a fixed set of punctuation
    characters. Tokens that are not in the vocabulary are encoded with the id
    stored under ``'<|unk|>'``.
    """

    # Delimiters used to split text; the capturing group keeps them as tokens.
    _SPLIT_RE = re.compile(r'([,.:;?_!()"\']|--|\s)')
    # Whitespace directly in front of a punctuation mark (removed on decode).
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?_!()"\'])')

    def __init__(self,vocab) -> None:
        """Store ``vocab`` (token -> id) and build the inverse id -> token map.

        The inverse map is a dict, so ids do not have to form the contiguous
        range ``0..len(vocab)-1``.
        """
        self.vocab = vocab
        self.un_vocab = {idx: tok for tok, idx in vocab.items()}

    def encoder(self,raw_txt):
        """Return the list of token ids for ``raw_txt``.

        Raises:
            KeyError: if an unknown token is met and ``vocab`` has no
                ``'<|unk|>'`` entry.
        """
        vocab = self.vocab
        # Strip each piece and drop the ones that are empty after stripping.
        pieces = (piece.strip() for piece in self._SPLIT_RE.split(raw_txt))
        return [vocab[tok] if tok in vocab else vocab['<|unk|>'] for tok in pieces if tok]

    def decoder(self,decoded):
        """Turn an iterable of token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation marks is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        joined = ' '.join(self.un_vocab[idx] for idx in decoded)
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', joined)

In [None]:
st = SimpleTokenizer_v1(vocab=vocab)

raw_txt_tst = '''I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
'''
# Round-trip the sample passage defined above (the cell previously encoded the
# whole corpus `raw_txt`, leaving `raw_txt_tst` unused).
# NOTE: despite its name, `decoded` holds the token ids produced by encoder().
decoded = st.encoder(raw_txt=raw_txt_tst)
print(decoded)
raw_txt_tst_new = st.decoder(decoded=decoded)

# Display the reconstructed text so it can be compared with the sample.
raw_txt_tst_new

In [None]:
# Encode a short sentence; any token that is not a key of the vocabulary is
# encoded with the id of '<|unk|>'.
raw_txt_tst = "Hello, do you like tea. Is this-- a test?"
st.encoder(raw_txt_tst)

In [None]:
# Two independent snippets of text.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the snippets with the <|endoftext|> marker between them.
text = " <|endoftext|> ".join((text1, text2))


# Encode with the word-level tokenizer; tokens missing from the vocabulary
# come back as the id of '<|unk|>'.
st.encoder(text)

In [None]:
tokenizer =  tiktoken.get_encoding("gpt2")

In [None]:
encoded = tokenizer.encode(text,allowed_special={'<|endoftext|>'})

In [None]:
tokenizer.decode(encoded)

In [None]:
# Re-read the corpus with an explicit encoding (assumes verdict.txt is UTF-8
# -- TODO confirm).
with open('verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
# Preview the text that was just read. The cell previously displayed the
# unrelated, earlier variable `raw_txt` in full.
raw_text[:99]

In [43]:
# NOTE: despite its name, `decoded_txt` holds the token ids returned by
# tokenizer.encode(), not decoded text.
decoded_txt = tokenizer.encode(raw_txt)
# Preview the first ten ids.
decoded_txt[:10]

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]

In [44]:
# Number of token ids in the example window.
cnt_ln = 4
# x is the first cnt_ln ids; y is the same window shifted one position to the
# right, so y[i] is the id that follows x[i] in the sequence.
x,y = decoded_txt[:cnt_ln],decoded_txt[1:cnt_ln+1]
print(x,y)

[40, 367, 2885, 1464] [367, 2885, 1464, 1807]


In [46]:
# Print each prefix of the id sequence next to the id that immediately
# follows it.
for i in range(1, 10):
    print(decoded_txt[:i],'-->',decoded_txt[i])

[40] --> 367
[40, 367] --> 2885
[40, 367, 2885] --> 1464
[40, 367, 2885, 1464] --> 1807
[40, 367, 2885, 1464, 1807] --> 3619
[40, 367, 2885, 1464, 1807, 3619] --> 402
[40, 367, 2885, 1464, 1807, 3619, 402] --> 271
[40, 367, 2885, 1464, 1807, 3619, 402, 271] --> 10899
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899] --> 2138


In [52]:
from torch.utils.data import Dataset, DataLoader

In [55]:
class LancerDataLoader(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tok``. A window of ``context_length`` ids
    is then taken every ``stride`` positions; the matching target is the same
    window shifted one position to the right. Window starts stop before
    ``len(ids) - context_length``, so every target has ``context_length`` ids.

    Note: the target list is stored under the attribute name ``taget_ids``
    (spelling kept as is).
    """

    def __init__(self,txt,tok,context_length,stride) -> None:
        self.tok = tok

        # `<|endoftext|>` is allowed so it can be passed through to the encoder.
        token_ids = self.tok.encode(txt,allowed_special={'<|endoftext|>'})
        window_starts = range(0, len(token_ids) - context_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + context_length])
            for start in window_starts
        ]
        self.taget_ids = [
            torch.tensor(token_ids[start + 1:start + context_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        sample = (self.input_ids[idx], self.taget_ids[idx])
        return sample

In [77]:
def create_dataloader_v1(text_raw,batch_size=4,context_length=256,stride=128,shuffle=True,drop_last=True):
    """Build a DataLoader of sliding-window (input, target) pairs from raw text.

    Args:
        text_raw: text to encode with the tiktoken "gpt2" encoding.
        batch_size: forwarded to ``DataLoader``.
        context_length: number of token ids per window.
        stride: step between the starts of consecutive windows.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``LancerDataLoader`` dataset.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    dataset = LancerDataLoader(text_raw, gpt2_tokenizer, context_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
    }
    return DataLoader(dataset, **loader_options)

In [65]:
# Read the corpus used to build the data loaders. The encoding is pinned so
# decoding does not depend on the platform's locale default (assumes
# verdict.txt is UTF-8 -- TODO confirm).
with open('verdict.txt', 'r', encoding='utf-8') as f:
    txt_raw = f.read()

In [85]:
dl = create_dataloader_v1(txt_raw,context_length=4,stride=1,shuffle=False,batch_size=2)

In [86]:
data_iter = iter(dl)
print(next(data_iter))
print(next(data_iter))


[tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])]
[tensor([[2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]]), tensor([[1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])]


In [88]:
# stride equals context_length here, so consecutive input windows do not
# overlap; shuffle=False keeps the batches in text order.
dl = create_dataloader_v1(txt_raw,context_length=4,stride=4,shuffle=False,batch_size=8)
data_iter = iter(dl)
# Each batch is an (inputs, targets) pair of tensors.
print(next(data_iter))
print(next(data_iter))

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]
[tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]]), tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
    

In [89]:
input_ids = torch.tensor([5,4,2,1])

In [90]:
vocab_size = 6
output_dim = 3

# Embedding weights are randomly initialised; seed the generator so the
# printed values are the same on every Restart & Run All.
torch.manual_seed(123)
embed = torch.nn.Embedding(vocab_size, output_dim)
print(embed.weight)

Parameter containing:
tensor([[ 1.3685,  0.5261, -1.0373],
        [-0.8425,  1.5123,  0.6960],
        [ 1.5445,  0.1742, -1.8514],
        [ 0.4560, -0.7771,  1.2037],
        [ 2.3678, -1.9636,  2.0754],
        [-0.0770, -0.4712, -1.4367]], requires_grad=True)


In [92]:
embed(torch.tensor([1]))
embed(input_ids)

tensor([[-0.0770, -0.4712, -1.4367],
        [ 2.3678, -1.9636,  2.0754],
        [ 1.5445,  0.1742, -1.8514],
        [-0.8425,  1.5123,  0.6960]], grad_fn=<EmbeddingBackward0>)

In [93]:
vocab_size = 50257
output_dim = 256

embed = torch.nn.Embedding(vocab_size, output_dim)


In [94]:
context_length = 4
dl = create_dataloader_v1(txt_raw,context_length=context_length,stride=4,shuffle=False,batch_size=8)

In [95]:
# Take the first (inputs, targets) batch from the loader.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the kernel session.
diter = iter(dl)
input,target = next(diter)

# Look up one embedding vector per token id in the batch.
input_embeding = embed(input)

In [97]:
input_embeding.shape

torch.Size([8, 4, 256])

In [98]:
pos_embed = torch.nn.Embedding(context_length,output_dim)

In [100]:
pos_embed.weight.shape

torch.Size([4, 256])

In [102]:
pos_embed(torch.arange(context_length)).shape

torch.Size([4, 256])

In [104]:
# Add position embeddings to the token embeddings. pos_embed(...) has shape
# (context_length, output_dim) and broadcasts over the leading batch
# dimension of input_embeding.
t = input_embeding+pos_embed(torch.arange(context_length))
t.shape


torch.Size([8, 4, 256])