In [None]:
# The aim of this code is to convert the CSV file of
# labels (the missing words) and sentences with missing words
# into a tensor of numbers that can be passed through
# the matching networks code just like the numpy array
# used for the images in the original code based on the
# Omniglot dataset

# The numbers in the tensor will be the numbers each word
# refers to in the vocabulary we are building
# We will not embed at this stage because this
# is done inside the matching network. We are, in effect,
# not completing the TorchText preprocessing

# This code is mainly a mixture of two tutorials:
# http://anie.me/On-Torchtext/
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

# Comments are a mixture of those from the tutorials (most of them)
# and my own

In [1]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext import data
from torchtext.data import Iterator
import spacy

In [2]:
# Check what the data looks like: preview the first rows of the training
# CSV, which has a `label` column (the missing word) and a `sentence` column
# (text in which the missing word appears as <blank_token>).

pd.read_csv("Adrian_data/train.csv").head(20)

Unnamed: 0,label,sentence
0,borgnine,hank azaria who provides the voice of apu comm...
1,borgnine,the unseen person or creature that attacks <bl...
2,borgnine,meanwhile the other junior campers led by er...
3,borgnine,<blank_token> was a guitar player in real life...
4,borgnine,seeing ned flanders get it wrong is great but ...
5,borgnine,rick porter of < unk > N it wrote in that he w...
6,borgnine,meanwhile the other junior campers led by er...
7,borgnine,ernest <blank_token> guest starred in the epis...
8,borgnine,<blank_token> apologized because he felt that ...
9,borgnine,in her book my life as a N year old boy cart...


In [3]:
# Use spacy to define a function to
# tokenize, or split up, into individual words
# the labels and sentences. Note the labels are already
# individual words

# Load the spaCy English pipeline once so the tokenizer function below can reuse it.
# NOTE(review): 'en' is a shortcut name; presumably it resolves to an installed
# English model (e.g. en_core_web_sm) — confirm for the spaCy version in use.
spacy_en = spacy.load('en')

def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy tokenizer."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        # Keep only the raw text of each spaCy token.
        tokens.append(token.text)
    return tokens

# We first define a Field, this is a class that contains
# information on how you want the data preprocessed. It acts
# like an instruction manual that data.TabularDataset will use.
# We define two fields, one for the sentences and one for the
# labels

# Sentences are sequences of words, so they are split with the spaCy-based
# `tokenizer` function defined above.
TEXT = data.Field(sequential=True, tokenize=tokenizer)
# Labels are already individual words, so this field is declared
# non-sequential; is_target=True flags it as the target field.
LABEL = data.Field(sequential=False, is_target=True)

In [4]:
# The fields know what to do when given raw data.
# Now, we need to tell the fields what data it
# should work on. This is where we use Datasets.

# The splits method creates a dataset for the train
# and test data by applying the same processing.

# Build the train and test datasets from the two CSV files, applying the
# same field processing to both. The `fields` list is matched to the CSV
# columns by position: first column -> LABEL, second column -> TEXT.
# skip_header=True drops the "label,sentence" header line of each file;
# without it the header is parsed as an ordinary example and the words
# "label" / "sentence" end up in the data and in the vocabularies.
train, test = data.TabularDataset.splits(
        path='Adrian_data/', train='train.csv', test='test.csv', format='csv',
        skip_header=True,
        fields=[('label', LABEL), ('sentence', TEXT)])

In [27]:
# Confirm what kind of object splits() returned for the training data.
type(train)

torchtext.data.dataset.TabularDataset

In [5]:
# Torchtext handles mapping words to integers, but
# it has to be told the full range of words it should
# handle. In our case, we probably want to build the
# vocabulary on the training set only, so we run the
# following code

# Both vocabularies are built from the training split only.
# NOTE(review): labels that occur only in test.csv will presumably all map to
# the LABEL vocabulary's unknown token — confirm this is acceptable for the
# matching networks code that consumes the label tensor.
TEXT.build_vocab(train)
LABEL.build_vocab(train)

# This makes torchtext go through all the elements in the
# training set, check the contents corresponding to the TEXT
# field, and register the words in its vocabulary. Torchtext
# has its own class called Vocab for handling the vocabulary.
# The Vocab class holds a mapping from word to id in its stoi
# attribute and a reverse mapping in its itos attribute.

In [6]:
# Keep a handle on the sentence vocabulary for later inspection.
# Printing it only shows the default object representation (class name and
# memory address), not the words it contains.
vocab = TEXT.vocab
print(vocab)

<torchtext.vocab.Vocab object at 0x00000233EBF5A668>


In [30]:
# In torchvision and PyTorch, the processing and batching of
# data is handled by DataLoaders. For some reason, torchtext
# has renamed the objects that do the exact same thing to
# Iterators. The basic functionality is the same

train_iter, test_iter = Iterator.splits(
        # Examples expose the field names declared for the datasets
        # ('label' and 'sentence'); there is no `Text` attribute, so the
        # sort key must read `x.sentence`. The previous key raised
        # AttributeError as soon as an iterator actually sorted its examples.
        (train, test), sort_key=lambda x: len(x.sentence),

        # (9000, 1000) means 9000 for train and 1000 for test.
        # That is, we only want to create one "batch" for each,
        # as we are only doing this process in TorchText to convert
        # our data into a PyTorch tensor object to be passed around
        # the matching networks program in the same way the
        # vision data was passed around in a numpy array.
        # The matching networks program already takes care
        # of batching and we don't want to disturb things too much.
        # NOTE(review): this gives a single batch per split only if the
        # datasets hold at most 9000 / 1000 examples — confirm against the CSVs.
        batch_sizes=(9000, 1000))

In [31]:
# Look at the batch

# batch = next(train_iter.__iter__()); batch

# Pull one batch from the training iterator and print its summary
# (the field names with the type and size of each tensor).
batch = next(iter(train_iter))
print(batch)


[torchtext.data.batch.Batch of size 9000]
	[.label]:[torch.LongTensor of size 9000]
	[.sentence]:[torch.LongTensor of size 159x9000]


In [32]:
# The iterator yields torchtext Batch objects rather than plain tensors.
type(batch)

torchtext.data.batch.Batch

In [33]:
# Currently, the iterator returns a custom datatype
# called torchtext.data.Batch.
# we’ll convert the batch to a tuple in the form
# (x, y) where x is the label tensor
# and y is the sentence

class BatchWrapper:
    """Adapt an iterable of batch objects into an iterable of ``(x, y)`` tuples.

    ``dl`` is the wrapped iterable of batches. ``x_var`` and ``y_var`` are the
    names of the batch attributes returned as the first and second element of
    each yielded tuple.
    """

    def __init__(self, dl, x_var, y_var):
        # Remember the wrapped iterable and the two attribute names to look up.
        self.dl = dl
        self.x_var = x_var
        self.y_var = y_var

    def __len__(self):
        # The wrapper has exactly as many batches as the wrapped iterable.
        return len(self.dl)

    def __iter__(self):
        # Walk the wrapped iterable lazily, unpacking one attribute pair per batch.
        for current in self.dl:
            first = getattr(current, self.x_var)
            second = getattr(current, self.y_var)
            yield first, second

In [34]:
# Wrap the training iterator so each batch comes back as a
# (label tensor, sentence tensor) tuple.
train_dl = BatchWrapper(dl=train_iter, x_var="label", y_var="sentence")

In [35]:
# Show the first (label, sentence) tuple produced by the wrapper.
next(iter(train_dl))

(tensor([2368, 2819, 6461,  ...,  286, 3566, 8939]),
 tensor([[  17,    3,    2,  ...,   17,   95, 1881],
         [ 679,    7,  211,  ...,    2, 1978,   11],
         [3238,    4,    6,  ...,   43,  306,  847],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]))

In [36]:
# The wrapper yields plain Python tuples.
type(next(iter(train_dl)))

tuple

In [37]:
# Type of the first tuple element (the label tensor).
type(next(iter(train_dl))[0])

torch.Tensor

In [38]:
# Type of the second tuple element (the sentence tensor).
type(next(iter(train_dl))[1])

torch.Tensor

In [39]:
# Shape of the label tensor: one entry per example in the batch.
next(iter(train_dl))[0].shape

torch.Size([9000])

In [40]:
# Shape of the sentence tensor. Every call starts a new iteration and draws
# a fresh batch, so presumably the first dimension (padded sentence length)
# can differ from the value shown in earlier cells.
next(iter(train_dl))[1].shape

torch.Size([219, 9000])

In [None]:
# We could now convert these to numpy arrays
# for the Matching Networks code so we don't
# have to change everything that was written for numpy arrays
# to PyTorch tensors. We could then convert
# back to Tensors when we need them