In [1]:
# Important, 

# The aim of this code is to convert the CSV file of
# labels = missing words, and sentences with missing words
# into a tensor of numbersthat can be passed through
# the matching networks code just like the numpy array
# used for the images in the original code based on the
# Onmiglot dataset

# The numbers in the tensor will be the numbers each word
# refers to in the vocabulary we are building
# We will not embed at this stage because this is
# is done inside the matching network. We are, in effect,
# not completing the TorchText proprocessing

# This code is a mainly a mixture of two tutorials:
# http://anie.me/On-Torchtext/
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

# Comments are a mixture of those from the tutorials (most of them)
# and my own

# Note to self, use Conda environment PyTorch1

In [2]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext import data
from torchtext.data import Iterator
import spacy

In [5]:
# Check what the data looks like

pd.read_csv("data/train2.csv").head(20)

Unnamed: 0,borgnine,hank azaria who provides the voice of apu commented that <blank_token> had no idea what the hell he was doing
0,borgnine,the unseen person or creature that attacks <bl...
1,borgnine,meanwhile the other junior campers led by er...
2,borgnine,<blank_token> was a guitar player in real life...
3,borgnine,seeing ned flanders get it wrong is great but ...
4,borgnine,rick porter of < unk > N it wrote in that he w...
5,borgnine,meanwhile the other junior campers led by er...
6,borgnine,ernest <blank_token> guest starred in the epis...
7,borgnine,<blank_token> apologized because he felt that ...
8,borgnine,in her book my life as a N year old boy cart...
9,tackles,tadman had seven solo <blank_token> three assi...


In [6]:
# Use spacy to define a function to 
# tokenize, or split up, into individual words
# the labels and sentences Note the labels are already
# individual words

spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

# We first define a Field, this is a class that contains
# information on how you want the data preprocessed. It acts
# like an instruction manual that data.TabularDataset will use.
# We define two fields, one for the sentencesm and one for the
# labels

TEXT = data.Field(sequential=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, is_target=True)

In [7]:
# The fields know what to do when given raw data.
# Now, we need to tell the fields what data it
# should work on. This is where we use Datasets.

# The splits method creates a dataset for the train
# and test data by applying the same processing.

train, test = data.TabularDataset.splits(
        path='data/', train='train2.csv', test='test2.csv', format='csv',
        fields=[('label', LABEL), ('sentence', TEXT)])

In [8]:
type(train)

torchtext.data.dataset.TabularDataset

In [9]:
# Torchtext handles mapping words to integers, but
# it has to be told the full range of words it should
# handle.

# We are currently building the vocab from the train
# and test data

TEXT.build_vocab(train, test)
LABEL.build_vocab(train, test)

# This makes torchtext go through all the elements in the
# training set, check the contents corresponding to the TEXT
# field, and register the words in its vocabulary. Torchtext
# has its own class called Vocab for handling the vocabulary.
# The Vocab class holds a mapping from word to id in its stoi
# attribute and a reverse mapping in its itos attribute.

In [10]:
# vocab = LABEL.vocab
# print(vocab.stoi)

In [11]:
# vocab = TEXT.vocab
# print(vocab.stoi)

In [12]:
# In torchvision and PyTorch, the processing and batching of
# data is handled by DataLoaders. For some reason, torchtext
# has renamed the objects that do the exact same thing to
# Iterators. The basic functionality is the same

train_iter, test_iter = Iterator.splits(
        (train, test),
    
        # (91270, 10153) means 91270 for train and 10153 for test,
        # the number of examples in each
        # That is, we only want to create one "batch" for each
        # as we are only doing this process in TorchText to convert
        # our data into a PyTorch tensor object to be passed around
        # the matching networks program in the same way the
        # vision data was passed around in a numpy array
        # The matching networks program already takes care
        # of batching and we don;t want to distrub things too much
    
        batch_sizes=(90000,10000))

# train_iter, test_iter = Iterator(dataset=train, batch_size = 10)

In [13]:
# Look at the batch

# batch = next(train_iter.__iter__()); batch

In [14]:
type(train_iter.__iter__())

generator

In [15]:
# Currently, the iterator returns a custom datatype
# called torchtext.data.Batch.
# we’ll convert the batch to a tuple in the form
# (x, y) where x is the label tensor
# and y is the sentence

class BatchWrapper:
    def __init__(self, dl, x_var, y_var):
        
        self.dl, self.x_var, self.y_var = dl, x_var, y_var # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            y = getattr(batch, self.y_var) # we assume only one input in this wrapper

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [16]:
train_dl = BatchWrapper(train_iter, "label", "sentence")
test_dl = BatchWrapper(test_iter, "label", "sentence")

In [17]:
X_train = next(train_dl.__iter__())
X_test = next(test_dl.__iter__())

TypeError: '<' not supported between instances of 'Example' and 'Example'

In [None]:
Y_0 = X_train[0].numpy()
Y_1 = X_train[1].numpy()
p = Y_0.argsort()
Y_0p = Y_0[p]
Y_1p = Y_1[:,p]
Yp = (Y_0p, Y_1p)
print(Yp.shape)

Z_0 = X_train[0].numpy()
Z_1 = X_train[1].numpy()
q = Z_0.argsort()
Z_0q = Z_0[q]
Z_1q = Z_1[:,q]
Zp = (Z_0q, Z_1q)
print(Zq.shape)

In [None]:
np.save('Yp.npy', Yp)
np.save('Zq.npy', Zq)

In [185]:
print(next(train_dl.__iter__())[1])

tensor([[ 49,  98,  52, 101,  11,  22,   4,   8,  10, 111,  44],
        [ 58,   1,   2,  16,  39,   4,  30,  71,   6,  21,   8],
        [  4,   1,   7,  12,   4,  30,  68,  23,  10,  31,  27],
        [ 93,   1,   3,  18,  55,  27,  50,  46,   2,  24,  22],
        [ 96,   1, 108,   2,  54,  38,  61,  53,   7,  18,  75],
        [ 13,   1,  78,   5,  89,  81,   8,  94,   3,   2,  16],
        [ 10,   1,   4,   3,  56,   2,  43,  66, 110,   5,  29],
        [  2,   1,  97,  19,  79,   5,   2,  87,  72,   3,  17],
        [  5,   1,   8,  42,  11,   3,   7, 104,  85,   4,  13],
        [  3,   1,  17,  33,   2,   4,   3,  99,  80,  32,  24],
        [ 48,   1,  28,   6,   5,  32,  76,  41,   6,  64,   2],
        [107,   1,   4,  31,   3,  77,  36,   4,   2,  34,   5],
        [ 25,   1, 109,  14,   9,  28,  21,   8,   7,   9,   3],
        [ 84,   1,  20,   8,  26,  40,  47,   2,   3,   4,   6],
        [  6,   1,  74, 105,   1,  17,  19,   5,   2,  86,  35],
        [ 90,   1,  12,  

In [186]:
next(train_dl.__iter__())[0].shape

torch.Size([11])

In [187]:
next(train_dl.__iter__())[1].shape

torch.Size([33, 11])

In [47]:
# Transpose

Y_train = next(train_dl.__iter__())[0]
X_train = next(train_dl.__iter__())[1]
Y_test = next(test_dl.__iter__())[0]
X_test = next(test_dl.__iter__())[1]


X_train = X_train.t()
Y_train = Y_train.t()
X_test = X_test.t()
Y_test = Y_test.t()

RuntimeError: t() expects a 2D tensor, but self is 1D

In [5]:
# Reshape

# We convert the tensors to numpy arrays
# for the Matching Networks code so we  don't
# have to change everything that was for numpy arrays
# to PyTorch tensors. We could could then convert
# back to Tensors when we need them

# Cell incomplete, issue with size of train. Should be 90,000, not 9,000 long

X_train = X_train.reshape()
Y_train = Y_train.reshape()
X_test = X_test.reshape()
Y_test = Y_test.reshape()


In [None]:
# Save to numpy array

np.save('X_train.npy', X_train)
np.save('Y_train.npy', Y_train)
np.save('X_test.npy', X_test)
np.save('Y_test.npy', Y_test)

tensor([[0, 1, 2],
        [3, 4, 5]])


tensor([[0, 3],
        [1, 4],
        [2, 5]])