<a href="https://colab.research.google.com/github/Ahmed-Elbagoury/Problem-Solving-Practice/blob/master/load_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is based on the code from https://github.com/rasbt/LLMs-from-scratch/tree/main.

In [33]:
from torch.utils.data import Dataset, DataLoader
import torch

In [35]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [36]:
import tiktoken

In [145]:
class MyDataSet(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once; every window of ``max_len`` token ids becomes
    an input, and the same window shifted one position to the right becomes
    its target.
    """

    def __init__(self, txt, tokenizer, max_len, stride):
        """Tokenize ``txt`` and pre-compute all windows.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object whose ``encode(text, allowed_special=...)``
                returns a sequence of token ids.
            max_len: Number of tokens in each input/target window.
            stride: Distance in tokens between the starts of two
                consecutive windows.
        """
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len(encoded) - max_len guarantees that the shifted
        # target window is always complete.
        window_starts = range(0, len(encoded) - max_len, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_len])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_len + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict with "input" and "target"."""
        sample = {
            "input": self.input_ids[idx],
            "target": self.target_ids[idx],
        }
        return sample

In [146]:
# GPT-2 byte-pair encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Small sample text used to eyeball the windowing behaviour.
txt = """This is a test text. This is mostly for testing the tokenization only.
Later text will test training and inferecne
"""

# Windows of 3 tokens, each starting 2 tokens after the previous one.
dataset = MyDataSet(txt, tokenizer, max_len=3, stride=2)

In [147]:
# Show every (input, target) window as decoded text next to its tensor sizes.
for ip, op in zip(dataset.input_ids, dataset.target_ids):
    print(ip.size(), op.size())
    input_text = tokenizer.decode(ip.tolist())
    target_text = tokenizer.decode(op.tolist())
    print(f"'{input_text}'", "-->", f"'{target_text}'")
    print("-------")

torch.Size([3]) torch.Size([3])
'This is a' --> ' is a test'
-------
torch.Size([3]) torch.Size([3])
' a test text' --> ' test text.'
-------
torch.Size([3]) torch.Size([3])
' text. This' --> '. This is'
-------
torch.Size([3]) torch.Size([3])
' This is mostly' --> ' is mostly for'
-------
torch.Size([3]) torch.Size([3])
' mostly for testing' --> ' for testing the'
-------
torch.Size([3]) torch.Size([3])
' testing the token' --> ' the tokenization'
-------
torch.Size([3]) torch.Size([3])
' tokenization only' --> 'ization only.'
-------
torch.Size([3]) torch.Size([3])
' only.
' --> '.
Later'
-------
torch.Size([3]) torch.Size([3])
'
Later text' --> 'Later text will'
-------
torch.Size([3]) torch.Size([3])
' text will test' --> ' will test training'
-------
torch.Size([3]) torch.Size([3])
' test training and' --> ' training and inf'
-------
torch.Size([3]) torch.Size([3])
' and infere' --> ' inferec'
-------
torch.Size([3]) torch.Size([3])
'erecne' --> 'cne
'
-------


In [148]:
def create_dataloader(txt, max_len=256, stride=128, batch_size=4, shuffle=True,
                      drop_last=True, num_workers=0):
    """Build a DataLoader over sliding token windows of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``MyDataSet``; the remaining arguments are passed to ``DataLoader``.

    Args:
        txt: Raw text to tokenize.
        max_len: Number of tokens per window.
        stride: Token offset between consecutive window starts.
        batch_size: Passed to ``DataLoader``.
        shuffle: Passed to ``DataLoader``.
        drop_last: Passed to ``DataLoader``.
        num_workers: Passed to ``DataLoader``.

    Returns:
        A ``DataLoader`` whose batches are dicts with "input" and "target".
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = MyDataSet(txt, gpt2_tokenizer, max_len, stride)
    return DataLoader(windows, batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last, num_workers=num_workers)

In [154]:
import os
import urllib.request

# Local file that holds the sample text used in the following cells.
file_path = "the-verdict.txt"

# Download only when the file is not already on disk, so re-running the
# cell does not fetch it again.
if not os.path.exists(file_path):
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
    urllib.request.urlretrieve(url, filename=file_path)

In [155]:
# Read the whole file into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
# NOTE(review): assumes the downloaded file is UTF-8 — confirm.
with open(file_path, "r", encoding="utf-8") as f:
    txt = f.read()

In [156]:
# One window per batch, unshuffled, with stride=1 so that consecutive
# batches are shifted by exactly one token and are easy to compare by eye.
loader = create_dataloader(txt, batch_size=1, max_len=4, stride=1,
                           shuffle=False)

In [157]:
# Pull batches by hand so each one can be inspected on its own.
data_iter = iter(loader)
first_batch = next(data_iter)
print(first_batch["input"], first_batch["target"], sep="\n")

tensor([[  40,  367, 2885, 1464]])
tensor([[ 367, 2885, 1464, 1807]])


In [158]:
# Next batch from the same iterator.
second_batch = next(data_iter)
print(second_batch["input"], second_batch["target"], sep="\n")

tensor([[ 367, 2885, 1464, 1807]])
tensor([[2885, 1464, 1807, 3619]])
