In [4]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper, Mapper
import torchtext

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm
################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



In [5]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

# Create an instance of your custom dataset
custom_dataset = CustomDataset(sentences)

# Define batch size
batch_size = 2

# Create a DataLoader
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

# Iterate through the DataLoader
for batch in dataloader:
    print(batch)

['You are awesome!', 'It is our choices, Harry, that show what we truly are, far more than our abilities.']
['Soon we must all face the choice between what is right and what is easy.', "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals."]
['Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.', "Fame's a fickle friend, Harry."]


In [6]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x25652b38320>

In [73]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

# Define a custom data set
class CustomDataset(Dataset):
    def __init__(self, sentences, tokenizer, vocab):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.sentences[idx])
        # Convert tokens to tensor indices using vocab
        tensor_indices = [self.vocab[token] for token in tokens]
        return torch.tensor(tensor_indices)

# Tokenizer
tokenizer = get_tokenizer("basic_english")

# Build vocabulary
vocab = build_vocab_from_iterator(map(tokenizer, sentences))

# Create an instance of your custom data set
custom_dataset = CustomDataset(sentences, tokenizer, vocab)

print("Custom Dataset Length:", len(custom_dataset))
print("Sample Items:")
for i in range(6):
    sample_item = custom_dataset[i]
    print(f"Item {i + 1}: {sample_item}")

6lines [00:00, 5824.07lines/s]

Custom Dataset Length: 6
Sample Items:
Item 1: tensor([13, 21, 65, 19, 15,  4,  5, 49,  8, 18, 47,  2, 57,  5, 43, 48, 26, 12,
        45, 63, 11, 46,  2, 16, 11, 35,  3])
Item 2: tensor([37,  8, 18,  5, 40, 42,  2, 10,  3])
Item 3: tensor([14,  7, 17, 33,  2, 10,  2, 59, 55,  4, 20, 64,  6,  2, 38, 51, 58, 17,
        23,  3])
Item 4: tensor([56, 20, 52, 25, 36, 60, 32, 29,  4,  7, 54,  9,  4,  7, 34,  3])
Item 5: tensor([68, 31, 16, 15, 12, 24, 62,  9, 39,  3, 30, 53, 50,  6, 44, 13, 61, 41,
         4, 14, 66, 19, 28, 67,  3])
Item 6: tensor([21,  6, 27, 22])





In [74]:
# Create a custom collate function
def collate_fn(batch):
    # Pad sequences within the batch to have equal lengths
    padded_batch = pad_sequence(batch, batch_first=True, padding_value=0)
    return padded_batch

In [79]:
# Create a data loader with the custom collate function with batch_first=True,
dataloader = DataLoader(custom_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Iterate through the data loader
for batch in dataloader: 
    for row in batch:
        words = [vocab.itos[idx] for idx in row]
        print(words)
        print(len(words))
       

[tensor([13, 21, 65, 19, 15,  4,  5, 49,  8, 18, 47,  2, 57,  5, 43, 48, 26, 12,
        45, 63, 11, 46,  2, 16, 11, 35,  3]), tensor([37,  8, 18,  5, 40, 42,  2, 10,  3])]
['if', 'you', 'want', 'to', 'know', 'what', 'a', 'man', "'", 's', 'like', ',', 'take', 'a', 'good', 'look', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'not', 'his', 'equals', '.']
27
['fame', "'", 's', 'a', 'fickle', 'friend', ',', 'harry', '.', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']
27
[tensor([14,  7, 17, 33,  2, 10,  2, 59, 55,  4, 20, 64,  6,  2, 38, 51, 58, 17,
        23,  3]), tensor([56, 20, 52, 25, 36, 60, 32, 29,  4,  7, 54,  9,  4,  7, 34,  3])]
['it', 'is', 'our', 'choices', ',', 'harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '.']
20
['soon', 'we', 'must', 'all', 'face', 'the', 'choice', 'between', 'what', 'is', 

In [59]:
# Create a custom collate function
def collate_fn_bfFALSE(batch):
    # Pad sequences within the batch to have equal lengths
    padded_batch = pad_sequence(batch, padding_value=0)
    return padded_batch

Now, you look into the curated data:


In [62]:
# Create a data loader with the custom collate function with batch_first=True,
dataloader_bfFALSE = DataLoader(custom_dataset, batch_size=batch_size, collate_fn=collate_fn_bfFALSE)

# Iterate through the data loader
for seq in dataloader_bfFALSE:
    for row in seq:
        #print(row)
        words = [vocab.itos[idx] for idx in row]
        print(words)

['if', 'fame']
['you', "'"]
['want', 's']
['to', 'a']
['know', 'fickle']
['what', 'friend']
['a', ',']
['man', 'harry']
["'", '.']
['s', '<unk>']
['like', '<unk>']
[',', '<unk>']
['take', '<unk>']
['a', '<unk>']
['good', '<unk>']
['look', '<unk>']
['at', '<unk>']
['how', '<unk>']
['he', '<unk>']
['treats', '<unk>']
['his', '<unk>']
['inferiors', '<unk>']
[',', '<unk>']
['not', '<unk>']
['his', '<unk>']
['equals', '<unk>']
['.', '<unk>']
['it', 'soon']
['is', 'we']
['our', 'must']
['choices', 'all']
[',', 'face']
['harry', 'the']
[',', 'choice']
['that', 'between']
['show', 'what']
['what', 'is']
['we', 'right']
['truly', 'and']
['are', 'what']
[',', 'is']
['far', 'easy']
['more', '.']
['than', '<unk>']
['our', '<unk>']
['abilities', '<unk>']
['.', '<unk>']
['youth', 'you']
['can', 'are']
['not', 'awesome']
['know', '!']
['how', '<unk>']
['age', '<unk>']
['thinks', '<unk>']
['and', '<unk>']
['feels', '<unk>']
['.', '<unk>']
['but', '<unk>']
['old', '<unk>']
['men', '<unk>']
['are', '<un

In [63]:
# Iterate through the data loader with batch_first = TRUE
for batch in dataloader:    
    print(batch)
    print("Length of sequences in the batch:",batch.shape[1])

tensor([[13, 21, 65, 19, 15,  4,  5, 49,  8, 18, 47,  2, 57,  5, 43, 48, 26, 12,
         45, 63, 11, 46,  2, 16, 11, 35,  3],
        [37,  8, 18,  5, 40, 42,  2, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0]])
Length of sequences in the batch: 27
tensor([[14,  7, 17, 33,  2, 10,  2, 59, 55,  4, 20, 64,  6,  2, 38, 51, 58, 17,
         23,  3],
        [56, 20, 52, 25, 36, 60, 32, 29,  4,  7, 54,  9,  4,  7, 34,  3,  0,  0,
          0,  0]])
Length of sequences in the batch: 20
tensor([[68, 31, 16, 15, 12, 24, 62,  9, 39,  3, 30, 53, 50,  6, 44, 13, 61, 41,
          4, 14, 66, 19, 28, 67,  3],
        [21,  6, 27, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0]])
Length of sequences in the batch: 25


In [80]:
# Define a custom data set
class CustomDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

In [81]:
custom_dataset=CustomDataset(sentences)

In [82]:
custom_dataset[0]

"If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals."

In [86]:
def collate_fn(batch):
    # Tokenize each sample in the batch using the specified tokenizer
    print(batch)
    tensor_batch = []
    for sample in batch:
        tokens = tokenizer(sample)
        # Convert tokens to vocabulary indices and create a tensor for each sample
        tensor_batch.append(torch.tensor([vocab[token] for token in tokens]))

    # Pad sequences within the batch to have equal lengths using pad_sequence
    # batch_first=True ensures that the tensors have shape (batch_size, max_sequence_length)
    padded_batch = pad_sequence(tensor_batch, batch_first=True)
    
    # Return the padded batch
    return padded_batch

In [87]:
# Create a data loader for the custom dataset
dataloader = DataLoader(
    dataset=custom_dataset,   # Custom PyTorch Dataset containing your data
    batch_size=batch_size,     # Number of samples in each mini-batch
    shuffle=True,              # Shuffle the data at the beginning of each epoch
    collate_fn=collate_fn      # Custom collate function for processing batches
)

In [88]:
for batch in dataloader:
    # print(batch)
    print("shape of sample",len(batch))

['It is our choices, Harry, that show what we truly are, far more than our abilities.', 'You are awesome!']
shape of sample 2
["If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.", "Fame's a fickle friend, Harry."]
shape of sample 2
['Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.', 'Soon we must all face the choice between what is right and what is easy.']
shape of sample 2


As a result, batches of tensors with equal lengths have been successfully created.


## Exercise


Create a data loader with a collate function that processes batches of French text (provided below). Sort the data set on sequences length. Then tokenize, numericalize and pad the sequences. Sorting the sequences will minimize the number of `<PAD>`tokens added to the sequences, which enhances the model's performance. Prepare the data in batches of size 4 and print them.


In [97]:
corpus = [
    "Ceci est une phrase.",
    "C'est un autre exemple de phrase.",
    "Voici une troisième phrase.",
    "Il fait beau aujourd'hui.",
    "J'aime beaucoup la cuisine française.",
    "Quel est ton plat préféré ?",
    "Je t'adore.",
    "Bon appétit !",
    "Je suis en train d'apprendre le français.",
    "Nous devons partir tôt demain matin.",
    "Je suis heureux.",
    "Le film était vraiment captivant !",
    "Je suis là.",
    "Je ne sais pas.",
    "Je suis fatigué après une longue journée de travail.",
    "Est-ce que tu as des projets pour le week-end ?",
    "Je vais chez le médecin cet après-midi.",
    "La musique adoucit les mœurs.",
    "Je dois acheter du pain et du lait.",
    "Il y a beaucoup de monde dans cette ville.",
    "Merci beaucoup !",
    "Au revoir !",
    "Je suis ravi de vous rencontrer enfin !",
    "Les vacances sont toujours trop courtes.",
    "Je suis en retard.",
    "Félicitations pour ton nouveau travail !",
    "Je suis désolé, je ne peux pas venir à la réunion.",
    "À quelle heure est le prochain train ?",
    "Bonjour !",
    "C'est génial !"
]

In [98]:
def collate_fn_fr(batch):
    # Pad sequences within the batch to have equal lengths
    tensor_batch=[]
    for sample in batch:
        tokens = tokenizer(sample)
        tensor_batch.append(torch.tensor([vocab[token] for token in tokens]))
         
    padded_batch = pad_sequence(tensor_batch,batch_first=True)
    return padded_batch

# Build tokenizer
tokenizer = get_tokenizer('basic_english')

# Build vocabulary
vocab = build_vocab_from_iterator(map(tokenizer, corpus))

# Sort sentences based on their length
sorted_data = sorted(corpus, key=lambda x: len(tokenizer(x)))
#print(sorted_data)
dataloader = DataLoader(sorted_data, batch_size=4, shuffle=False, collate_fn=collate_fn_fr)

30lines [00:00, 31528.22lines/s]


In [99]:
for batch in dataloader:
    print(batch)

tensor([[43,  4,  0],
        [42, 34,  4],
        [78, 11,  4],
        [38, 98,  4]])
tensor([[  3,   5,  70,   2,   0],
        [  3,   5,  76,   2,   0],
        [ 45,   7,  14,  13,   2],
        [113,  14, 104,  13,   2]])
tensor([[  3, 102,   6,  30,   2],
        [  3,  20, 100,  21,   2],
        [  3,   5,  17,  97,   2],
        [ 15,   6,   7,  68,   4]])
tensor([[ 93,   7,  23,  88,  91,  10],
        [  8,  64, 118, 115,  44,   4],
        [ 12,  80,  31,  19,  82,   2],
        [ 67,  22,  23,  84,  25,   4]])
tensor([[ 18,  62,  41,  39,   6,  71,   2,   0],
        [ 83,  55,  86, 107,  53,  77,   2,   0],
        [ 19, 109, 101, 103, 105,  49,   2,   0],
        [ 72,   6,  32,  11,  12,  50,  66,   2]])
tensor([[  3, 110,  48,   8,  81,  46,  36,   2,   0],
        [  3,   5,  95,   9, 114,  96,  58,   4,   0],
        [ 26,  94,  69,   7,   8,  89,  24,  10,   0],
        [ 15,   6,   7, 108,  40,  61,   9,  13,   2]])
tensor([[  3,  56,  29,  16,  85,  60,  16,  7

In [105]:
from torchtext.datasets import multi30k,Multi30k
# Multi30k["train"] = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0205EN-SkillsNetwork/training.tar.gz"
# Multi30k["valid"] = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0205EN-SkillsNetwork/validation.tar.gz"

ImportError: cannot import name 'multi30k' from 'torchtext.datasets' (c:\Users\vishwas.balkundi\miniforge3\envs\myenv2\Lib\site-packages\torchtext\datasets\__init__.py)

In [None]:
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

In [None]:
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

In [None]:
data_set = iter(train_iter)

In [None]:
for n in range(5):
    # Getting the next pair of source and target sentences from the training data set
    src, tgt = next(data_set)

    # Printing the source (German) and target (English) sentences
    print(f"sample {str(n+1)}")
    print(f"Source ({SRC_LANGUAGE}): {src}\nTarget ({TGT_LANGUAGE}): {tgt}")

In [None]:
german, english = next(data_set)
print(f"Source German ({SRC_LANGUAGE}): {german}\nTarget English  ({TGT_LANGUAGE}): { english }")

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
# Making a placeholder dict to store both tokenizers
token_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

In [None]:
token_transform['de'](german)

In [None]:
token_transform['en'](english)

In [None]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [None]:
#place holder dict for 'en' and 'de' vocab transforms
vocab_transform = {}

In [None]:
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    # Define a mapping to associate the source and target languages
    # with their respective positions in the data samples.
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    # Iterate over each data sample in the provided dataset iterator
    for data_sample in data_iter:
        # Tokenize the data sample corresponding to the specified language
        # and yield the resulting tokens.
        yield token_transform[language](data_sample[language_index[language]])

In [None]:
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data iterator
    train_iterator = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    #To decrease the number of padding tokens, you sort data on the source length to batch similar-length sequences together
    sorted_dataset = sorted(train_iterator, key=lambda x: len(x[0].split()))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(sorted_dataset, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

In [None]:
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [None]:
seq_en=vocab_transform['en'](token_transform['en'](english))
print(f"English text string: {english}\n English sequence: {seq_en}")

seq_de=vocab_transform['de'](token_transform['de'](german))
print(f"German text string: {german}\n German sequence: {seq_de}")


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# function to add BOS/EOS, flip source sentence and create tensor for input sequence indices
def tensor_transform_s(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.flip(torch.tensor(token_ids), dims=(0,)),
                      torch.tensor([EOS_IDX])))

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform_t(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

In [None]:
seq_en=tensor_transform_s(seq_en)
seq_en

In [None]:
seq_de=tensor_transform_t(seq_de)
seq_de

In [None]:
# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}

text_transform[SRC_LANGUAGE] = sequential_transforms(token_transform[SRC_LANGUAGE], #Tokenization
                                            vocab_transform[SRC_LANGUAGE], #Numericalization
                                            tensor_transform_s) # Add BOS/EOS and create tensor

text_transform[TGT_LANGUAGE] = sequential_transforms(token_transform[TGT_LANGUAGE], #Tokenization
                                            vocab_transform[TGT_LANGUAGE], #Numericalization
                                            tensor_transform_t) # Add BOS/EOS and create tensor


In [None]:
# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_sequences = text_transform[SRC_LANGUAGE](src_sample.rstrip("\n"))
        src_sequences = torch.tensor(src_sequences, dtype=torch.int64)
        tgt_sequences = text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n"))
        tgt_sequences = torch.tensor(tgt_sequences, dtype=torch.int64)
        src_batch.append(src_sequences)
        tgt_batch.append(tgt_sequences)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX,batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX,batch_first=True)
    
    return src_batch.to(device), tgt_batch.to(device)


In [None]:
BATCH_SIZE = 4

train_iterator = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
sorted_train_iterator = sorted(train_iterator, key=lambda x: len(x[0].split()))
train_dataloader = DataLoader(sorted_train_iterator, batch_size=BATCH_SIZE, collate_fn=collate_fn,drop_last=True)

valid_iterator = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
sorted_valid_dataloader = sorted(valid_iterator, key=lambda x: len(x[0].split()))
valid_dataloader = DataLoader(sorted_valid_dataloader, batch_size=BATCH_SIZE, collate_fn=collate_fn,drop_last=True)


src, trg = next(iter(train_dataloader))
src,trg

```{## Change Log}
```


```{|Date (YYYY-MM-DD)|Version|Changed By|Change Description|}
```
```{|-|-|-|-|}
```
```{|2023-10-24|0.1|Roodra|Created Lab Template|}
```
