In [4]:
# Read the entire short story into one string; the encoding is given explicitly
# rather than left to the platform default.
with open("the_verdict_a_short_story.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
import tiktoken
import torch

In [6]:
# Bare expression: the notebook displays the loaded string as the cell result.
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [9]:
# Fetch tiktoken's "gpt2" encoding and convert the story into a list of integer token ids.
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text_aka_token_ids = tokenizer.encode(raw_text)
# Bare expression: displays the complete id list as the cell result.
encoded_text_aka_token_ids

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [10]:
print(len(encoded_text_aka_token_ids)) # number of token ids in the whole short story (5145 in the recorded run)

5145


In [12]:
# Drop the first 50 token ids; the context/target demos below work on the remainder.
sample_text_token_ids = encoded_text_aka_token_ids[50:]

In [15]:
context_size = 4 # maximum number of input tokens used to predict the next token
# Grow the context one token at a time; the target is always the id right after it.
for i in range(1, context_size + 1):
    context = sample_text_token_ids[:i] # the first i token ids
    desired = sample_text_token_ids[i] # the token id that follows that context
    print(context, "----->", desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [27]:
context_size = 4 # maximum number of input tokens used to predict the next token
# Print each growing context and its next-token target as decoded text instead of raw ids.
for i in range(1, context_size + 1):
    context = sample_text_token_ids[:i]
    desired = sample_text_token_ids[i]
    # decode() is given a list of ids, so the single target id is wrapped in a list.
    print(tokenizer.decode(context), "----->", tokenizer.decode([desired])) # input/target pair, shown as text

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [1]:
from torch.utils.data import DataLoader, Dataset

In [78]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once and cut into windows of ``context_size`` token
    ids. Each sample is an ``(input, target)`` pair of 1-D tensors in which the
    target is the input window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(txt)`` that returns a list of
            integer token ids (the notebook passes a tiktoken encoding).
        context_size: Number of token ids per window; must be >= 1.
        stride: Distance, in tokens, between the starts of consecutive
            windows; must be >= 1.

    Raises:
        ValueError: If ``context_size`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, context_size, stride):
        # Fail loudly instead of silently producing an empty dataset (negative
        # stride) or empty/garbage windows (non-positive context_size).
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # Parallel lists of tensors. The attribute names are this class's own
        # convention; only __len__ and __getitem__ below read them.
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # Stop at len - context_size so the target window, which ends one token
        # later than the input window, never runs past the end of the text.
        for start in range(0, len(token_ids) - context_size, stride):
            end = start + context_size
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
    

In [79]:
def create_data_loader_v1(txt, batch_size, context_size, stride, shuffle, drop_last, num_workers):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    ``GPTDatasetV1`` using ``context_size`` and ``stride``. The remaining
    arguments are forwarded unchanged to ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, context_size, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [80]:
# Now let's move on to the examples: read the story text again, this time into
# raw_text_of_short_story, which the data-loader cells below use.
with open("the_verdict_a_short_story.txt", "r", encoding="utf-8") as f:
    raw_text_of_short_story = f.read()

In [81]:
# batch_size=1: every batch holds a single 4-token window. stride=1: consecutive
# windows start one token apart. shuffle=True: windows are drawn in random order.
dataloader = create_data_loader_v1(raw_text_of_short_story, batch_size=1, context_size=4, stride=1, shuffle=True, drop_last=True, num_workers=0)

In [82]:
import torch

In [83]:
# Keep an explicit iterator so that successive next() calls walk through the batches.
data_iterator = iter(dataloader)
first_batch = next(data_iterator)

In [84]:
# A batch is an [inputs, targets] pair; the targets are the inputs shifted one token to the right.
print(first_batch)

[tensor([[  11, 9074,   13,  520]]), tensor([[9074,   13,  520, 5493]])]


In [85]:
# Advance the same iterator to obtain the following batch.
second_batch = next(data_iterator)

In [86]:
# Same [inputs, targets] layout as the first batch.
print(second_batch)

[tensor([[ 1871, 12734,   379,  1123]]), tensor([[12734,   379,  1123,  9581]])]


In [87]:
# Example with a batch size greater than 1: eight 4-token windows per batch.
# NOTE(review): no RNG seed is set before this point in the notebook, so with
# shuffle=True the batch shown presumably differs from run to run.
data_loader = create_data_loader_v1(raw_text_of_short_story, batch_size=8, 
                                   context_size=4, stride=4, shuffle=True, drop_last=True, num_workers=0) # use shuffle=False to see more easily how the other arguments shape the batches
# NOTE: data_loader is rebound here from the DataLoader to an iterator over it.
data_loader = iter(data_loader)
input_token_ids, target_token_token_ids = next(data_loader)
print("inputs: \n", input_token_ids)
print("\nTargets: \n", target_token_token_ids)

# If stride == context_size: windows do not overlap; less redundancy and faster training, but fewer samples.

# If stride < context_size: windows overlap; more samples but slower, possibly better training.

# If stride == 1: maximum overlap, so high redundancy and slower training; useful for small datasets.

inputs: 
 tensor([[ 4119,    81,   621,   339],
        [  351,   326,  1808,   319],
        [ 1781,   314,  4001,   284],
        [  286,   326,    11,  1770],
        [  402,   271, 10899,   338],
        [10899,   438,    69,   623],
        [  772,  2993,   262,   520],
        [   12, 12239,    13,   198]])

Targets: 
 tensor([[   81,   621,   339,   373],
        [  326,  1808,   319,   340],
        [  314,  4001,   284,   466],
        [  326,    11,  1770,    13],
        [  271, 10899,   338, 12036],
        [  438,    69,   623,  1576],
        [ 2993,   262,   520,  5493],
        [12239,    13,   198,   198]])


In [89]:
# Creating embeddings: a toy example with four token ids to look up.
input_ids = torch.tensor([2, 3, 5, 1])

In [92]:
vocab_size = 6 # number of rows in the toy embedding table (token ids 0..5)
output_dims = 3 # length of each embedding vector

In [93]:
# Seed torch's global RNG so the randomly initialised embedding weights created next are repeatable.
torch.manual_seed(123)

<torch._C.Generator at 0x1a86ab13810>

In [94]:
# Lookup table with vocab_size rows and output_dims columns (6 x 3 here).
embedding_layer = torch.nn.Embedding(vocab_size, output_dims)

In [95]:
print(embedding_layer.weight) # 6 x 3 randomly initialised weight matrix: one 3-dimensional embedding per token id 0..5, in row order

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [99]:
token_embedding_of_input_ids = embedding_layer(input_ids) # looks up the weight rows for ids 2, 3, 5, 1, in that order

In [100]:
print(token_embedding_of_input_ids) # each output row can be checked against the weight matrix printed above: row k of the output is weight[input_ids[k]]

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [121]:
# Positional embeddings are added to token embeddings to form the final input
# embeddings that go into the main LLM layers.
# Here is a slightly more realistic example of the token-embedding half.

vocab_size = 50257 # total number of token ids
output_dim = 256 # length of each token embedding vector
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [122]:
# shuffle=False keeps the windows in text order, so the first batch starts at the
# beginning of the story; stride == context_size means the windows do not overlap.
dataloader = create_data_loader_v1(raw_text_of_short_story, 
                                   batch_size=8, context_size = 4, stride=4, shuffle=False, drop_last=True, num_workers=0)

In [123]:
context_size = 4 # window length; same value as the context_size passed to the data loader above

In [124]:
inputs, targets = next(iter(dataloader)) # take only the first batch from a fresh iterator

In [125]:
# 8 rows (batch_size) of 4 token ids (context_size) each.
print(" input token_ids ", inputs)

 input token_ids  tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [126]:
# Replace every token id with its 256-dimensional vector: (8, 4) ids -> (8, 4, 256) embeddings.
token_embeddings = token_embedding_layer(inputs)

In [127]:
# Expect (batch_size, context_size, output_dim).
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [128]:
# Inspect the raw values; torch abbreviates long rows with "...".
print(token_embeddings)

tensor([[[-1.8110e+00,  5.9972e-01,  7.1007e-01,  ...,  2.6609e-01,
           6.1868e-01, -3.9322e-01],
         [-1.9088e-01,  7.1345e-02, -3.8122e-02,  ...,  1.4721e-01,
          -3.4023e-01,  1.6866e-01],
         [-5.6576e-02, -6.0120e-01, -2.7563e-01,  ..., -1.2445e+00,
          -2.0547e+00,  1.9596e-01],
         [ 9.8848e-01,  1.1787e-01,  2.8492e-01,  ...,  1.0044e+00,
           4.9365e-02,  1.0088e+00]],

        [[ 2.7329e-02,  4.0165e-01,  1.3347e+00,  ..., -3.1196e-01,
           7.3017e-01,  1.0257e+00],
         [ 7.0987e-01, -3.5913e-01, -7.7819e-01,  ..., -1.1336e+00,
          -7.4470e-01, -9.9091e-01],
         [ 1.1999e-01, -3.1565e-01,  1.6696e-01,  ..., -1.2786e+00,
           3.6759e-01,  6.2119e-02],
         [ 6.3661e-01,  1.0656e+00, -2.9600e-01,  ..., -8.2773e-01,
          -6.9150e-01,  9.8483e-01]],

        [[-7.1078e-01, -5.5337e-02, -2.2731e-01,  ...,  1.4535e+00,
          -6.2174e-01,  1.6584e-01],
         [ 5.6046e-01,  7.4998e-01, -4.4525e-01,  .

In [129]:
# Absolute positional embeddings: one learned output_dim-sized vector per position
# in the context window (a context_size x output_dim table), to be added to the
# token embeddings of every sample in the batch.
# The table is sized by context_size instead of rebinding vocab_size, so
# vocab_size keeps meaning "number of token ids" for every other cell.
output_dim = 256 # must equal the token embedding width so the two can be summed
positional_embedding_layer = torch.nn.Embedding(context_size, output_dim)

In [130]:
# Position indices 0..3, one per slot in the 4-token window.
# NOTE: this rebinds input_ids, which earlier held the toy token ids [2, 3, 5, 1].
input_ids = torch.arange(4)
print(input_ids)

tensor([0, 1, 2, 3])


In [131]:
# Look up one embedding vector per position index.
positional_embeddings = positional_embedding_layer(input_ids)

In [132]:
# One row per position; torch abbreviates the 256 columns with "...".
print(positional_embeddings)

tensor([[ 1.0696, -0.9701, -0.9684,  ..., -0.8631, -0.2154, -0.8643],
        [-1.6282,  0.8200,  0.2141,  ..., -0.9073, -1.2348,  2.0505],
        [-1.0969,  0.9882, -0.7024,  ..., -0.4517,  0.2479, -0.3560],
        [ 2.6964, -1.3863,  0.3334,  ..., -1.2751,  2.3357, -0.8028]],
       grad_fn=<EmbeddingBackward0>)


In [133]:
# Expect (4, 256): one 256-dimensional vector for each of the 4 positions.
print(positional_embeddings.shape)

torch.Size([4, 256])


In [135]:
# Calculate the input embeddings: the (4, 256) positional table is broadcast over
# the batch dimension, so the same positional vectors are added to each of the 8 samples.
input_embeddings= token_embeddings + positional_embeddings

In [136]:
# The sum keeps the token embeddings' shape: (8, 4, 256).
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [137]:
# Inspect the summed values; torch abbreviates the tensor with "...".
print(input_embeddings)

tensor([[[-0.7414, -0.3704, -0.2583,  ..., -0.5970,  0.4032, -1.2575],
         [-1.8191,  0.8914,  0.1760,  ..., -0.7601, -1.5751,  2.2191],
         [-1.1535,  0.3870, -0.9781,  ..., -1.6962, -1.8068, -0.1600],
         [ 3.6849, -1.2685,  0.6183,  ..., -0.2707,  2.3851,  0.2060]],

        [[ 1.0969, -0.5685,  0.3663,  ..., -1.1750,  0.5147,  0.1615],
         [-0.9184,  0.4609, -0.5641,  ..., -2.0409, -1.9796,  1.0596],
         [-0.9769,  0.6725, -0.5355,  ..., -1.7304,  0.6155, -0.2938],
         [ 3.3330, -0.3208,  0.0374,  ..., -2.1028,  1.6442,  0.1820]],

        [[ 0.3588, -1.0255, -1.1957,  ...,  0.5905, -0.8372, -0.6985],
         [-1.0678,  1.5700, -0.2312,  ...,  0.3648, -1.8144,  2.9857],
         [-1.1819,  0.2522,  1.3410,  ..., -0.4492,  1.8146, -0.8212],
         [ 4.5650, -2.7336,  1.6391,  ..., -2.2011,  2.4834, -1.7435]],

        ...,

        [[ 1.1532, -3.8735, -1.6065,  ..., -1.1901,  0.2299, -1.2118],
         [-0.8675,  2.4911, -1.9971,  ..., -4.8362, -2.39