In [1]:
from torch.utils.data import DataLoader
from functools import partial

In [2]:
def custom_collate_fn(batch, window_size, word_to_ix):
    """Collate (sentence, labels) pairs into padded index tensors.

    Every sentence is surrounded by ``window_size`` "<pad>" tokens on each
    side, converted to vocabulary indices (tokens missing from ``word_to_ix``
    map to the "<unk>" index), and then right-padded with the "<pad>" index
    so that all examples in the batch have the same length. Label sequences
    are right-padded with 0.

    Args:
        batch: Iterable of ``(sentence, labels)`` pairs. ``sentence`` is a
            sequence of tokens (list, tuple, ...); ``labels`` is a sequence
            of integer labels.
        window_size: Number of "<pad>" tokens added to each side of a
            sentence.
        word_to_ix: Mapping from token to index. Must contain the "<pad>"
            and "<unk>" keys.

    Returns:
        A tuple ``(x_padded, y_padded, lengths)``:
            x_padded: LongTensor with the batch as the first dimension,
                holding the window-padded token indices.
            y_padded: LongTensor with the batch as the first dimension,
                holding the labels padded with 0.
            lengths: LongTensor with the number of labels of each example
                before padding, so padded label positions can be told
                apart from real labels.

    Raises:
        KeyError: If "<pad>" or "<unk>" is missing from ``word_to_ix``.
    """
    # Break the batch into the training examples and their labels.
    sentences, labels = zip(*batch)

    pad_token = "<pad>"
    pad_token_ix = word_to_ix[pad_token]
    # Looked up once instead of once per token.
    unk_token_ix = word_to_ix["<unk>"]

    # Window-pad each sentence and turn its tokens into indices.
    # list(sentence) lets tuples (or other sequences) be concatenated with
    # the list of pad tokens.
    window = [pad_token] * window_size
    x = []
    for sentence in sentences:
        padded_sentence = window + list(sentence) + window
        indices = [word_to_ix.get(token, unk_token_ix) for token in padded_sentence]
        # pad_sequence expects tensors, so each example becomes one here.
        x.append(torch.LongTensor(indices))

    # Pad the examples so that all of them have the same length, making it
    # possible to do matrix operations. batch_first=True makes the batch the
    # first dimension of the returned matrix.
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

    # Record the number of labels of each example before padding the labels,
    # so that we know how many words existed in each example.
    lengths = torch.LongTensor([len(label) for label in labels])

    y = [torch.LongTensor(label) for label in labels]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    # The order of the returned values matches the order in which they are
    # unpacked when iterating over the DataLoader.
    return x_padded, y_padded, lengths

In [3]:
def _custom_collate_fn(batch, window_size, word_to_ix):
    """Compact collate function: window-pad, index and pad a batch.

    Relies on ``pad_window`` and ``convert_tokens_to_indices`` being defined
    at module level (presumably the same helpers that are nested inside
    ``custom_collate_fn`` -- confirm against their definitions).

    Args:
        batch: Iterable of ``(sentence, labels)`` pairs.
        window_size: Forwarded to ``pad_window`` as ``window_size``.
        word_to_ix: Token-to-index mapping; must contain "<pad>".

    Returns:
        Tuple ``(x_padded, y_padded, lengths)`` of LongTensors: the padded
        inputs, the labels padded with 0, and the number of labels per
        example before padding.
    """
    sentences, labels = zip(*batch)

    # Prepare the inputs: window padding first, then token -> index.
    windowed = [pad_window(tokens, window_size=window_size) for tokens in sentences]
    indexed = [convert_tokens_to_indices(tokens, word_to_ix) for tokens in windowed]

    # Bring every example in the batch to the same length.
    pad_ix = word_to_ix["<pad>"]
    x_padded = nn.utils.rnn.pad_sequence(
        [torch.LongTensor(ids) for ids in indexed],
        batch_first=True,
        padding_value=pad_ix,
    )

    # Remember the true label counts, then pad the labels with 0.
    label_lengths = torch.LongTensor([len(seq) for seq in labels])
    y_padded = nn.utils.rnn.pad_sequence(
        [torch.LongTensor(seq) for seq in labels],
        batch_first=True,
        padding_value=0,
    )

    return x_padded, y_padded, label_lengths

In [4]:
# DataLoader configuration: each datapoint is a (sentence, labels) pair.
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
# Bind the extra arguments so the DataLoader can call collate_fn(batch).
collate_fn = partial(
    custom_collate_fn,
    window_size=window_size,
    word_to_ix=word_to_ix,
)

# Build the DataLoader from the settings above.
loader = DataLoader(
    data,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn,
)

# Walk through the loader once, showing what every batch looks like.
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(f"Iteration {counter}")
    # sep="\n" puts each heading and its tensor on separate lines.
    print("Batched Input:", batched_x, sep="\n")
    print("Batched Labels:", batched_y, sep="\n")
    # The trailing "" produces the blank line that separates iterations.
    print("Batched Lengths:", batched_lengths, "", sep="\n")
    counter += 1

NameError: name 'train_sentences' is not defined

In [5]:
# Show the batched input exactly as it was produced by the last loop iteration.
print("Original Tensor: ")
# end="\n\n" leaves one blank line after the tensor.
print(batched_x, end="\n\n")

# Slide a window of 2 * window_size + 1 positions along dimension 1, moving
# one position at a time.
chunk = batched_x.unfold(1, 2 * window_size + 1, 1)
print("Windows: ")
print(chunk)

Original Tensor: 


NameError: name 'batched_x' is not defined