In [1]:
# Important, 

# The aim of this code is to convert the CSV file of
# labels = missing words, and sentences with missing words
# into a tensor of numbersthat can be passed through
# the matching networks code just like the numpy array
# used for the images in the original code based on the
# Onmiglot dataset

# The numbers in the tensor will be the numbers each word
# refers to in the vocabulary we are building
# We will not embed at this stage because this is
# is done inside the matching network. We are, in effect,
# not completing the TorchText proprocessing

# This code is a mainly a mixture of two tutorials:
# http://anie.me/On-Torchtext/
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

# Comments are a mixture of those from the tutorials (most of them)
# and my own

# Note to self, use Conda environment PyTorch1

In [2]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext import data
from torchtext.data import Iterator
import spacy

In [3]:
# Check what the data looks like

pd.read_csv("data/train_experiments2.csv").head(20)

Unnamed: 0,tackles,tadman had seven solo <blank_token> three assisted tackles and recorded a defensive touchdown after recovering chris johnson 's fumble late in the fourth quarter
0,tackles,defensive line coach bo davis resigned his pos...
1,tackles,N and N < unk > were linebackers paul nelson a...
2,tackles,courtney < unk > was named the sec defensive p...
3,tackles,williams also recorded eight solo <blank_token...
4,tackles,behind defensive mvp burt miami had one player...
5,tackles,in addition the ecu defense ranked eleventh na...
6,tackles,burt the other mvp accumulated nine <blank_tok...
7,tackles,the other interception came from defensive bac...
8,tackles,brown finished the regular season with N <blan...
9,amended,the constitution has since been <blank_token> ...


In [4]:
# Use spacy to define a function to 
# tokenize, or split up, into individual words
# the labels and sentences Note the labels are already
# individual words

spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

# We first define a Field, this is a class that contains
# information on how you want the data preprocessed. It acts
# like an instruction manual that data.TabularDataset will use.
# We define two fields, one for the sentencesm and one for the
# labels

TEXT = data.Field(sequential=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, is_target=True)

In [5]:
# The fields know what to do when given raw data.
# Now, we need to tell the fields what data it
# should work on. This is where we use Datasets.

# The splits method creates a dataset for the train
# and test data by applying the same processing.

train, test = data.TabularDataset.splits(
        path='data/', train='train_experiments2.csv', test='test_experiments2.csv', format='csv',
        fields=[('label', LABEL), ('sentence', TEXT)])

In [6]:
type(train)

torchtext.data.dataset.TabularDataset

In [16]:
# Torchtext handles mapping words to integers, but
# it has to be told the full range of words it should
# handle.

# We are currently building the vocab from the train
# and test data

TEXT.build_vocab(train, test)
LABEL.build_vocab(train, test)

# This makes torchtext go through all the elements in the
# training set, check the contents corresponding to the TEXT
# field, and register the words in its vocabulary. Torchtext
# has its own class called Vocab for handling the vocabulary.
# The Vocab class holds a mapping from word to id in its stoi
# attribute and a reverse mapping in its itos attribute.

In [17]:
vocab = LABEL.vocab
print(vocab.stoi)

defaultdict(<function _default_unk_index at 0x000001F497E5BEA0>, {'<unk>': 0, 'amended': 1, 'tackles': 2, 'borgnine': 3, 'sir': 4})


In [9]:
# https://stackoverflow.com/questions/5844672/delete-an-element-from-a-dictionary

{i:vocab.stoi[i] for i in vocab.stoi if i!='label'}

{'<unk>': 0, 'amended': 1, 'tackles': 2, 'borgnine': 3, 'sir': 4}

In [18]:
vocab = TEXT.vocab
print(vocab.stoi)

defaultdict(<function _default_unk_index at 0x000001F497E5BEA0>, {'<unk>': 0, '<pad>': 1, '<': 2, '>': 3, 'the': 4, 'unk': 5, 'blank_token': 6, 'and': 7, 'to': 8, 'N': 9, 'a': 10, 'in': 11, 'of': 12, 'was': 13, 'for': 14, 'by': 15, 'with': 16, 'defensive': 17, 'seven': 18, 'after': 19, 'had': 20, 'his': 21, 'who': 22, 'is': 23, 'player': 24, 'principal': 25, 'that': 26, 'were': 27, 'while': 28, "'s": 29, 'assisted': 30, 'but': 31, 'constitution': 32, 'four': 33, 'fumble': 34, 'gold': 35, 'he': 36, 'james': 37, 'one': 38, 'other': 39, 'solo': 40, 'they': 41, 'three': 42, 'up': 43, 'would': 44, 'also': 45, 'an': 46, 'as': 47, 'be': 48, 'been': 49, 'between': 50, 'burt': 51, 'coach': 52, 'coin': 53, 'dollar': 54, 'egypt': 55, 'eight': 56, 'end': 57, 'from': 58, 'gallipoli': 59, 'great': 60, 'guitar': 61, 'has': 62, 'henry': 63, 'him': 64, 'however': 65, 'i.': 66, 'it': 67, 'italy': 68, 'john': 69, 'johnson': 70, 'legislation': 71, 'loss': 72, 'macedonia': 73, 'mvp': 74, 'named': 75, 'off'

In [255]:
{i:vocab.stoi[i] for i in vocab.stoi if i!='sentence'}

{'<unk>': 0,
 '<pad>': 1,
 '<': 2,
 '>': 3,
 'the': 4,
 'unk': 5,
 'and': 6,
 'blank_token': 7,
 'to': 8,
 'a': 9,
 'N': 10,
 'was': 11,
 'in': 12,
 'of': 13,
 'for': 14,
 'by': 15,
 'after': 16,
 'his': 17,
 'defensive': 18,
 'is': 19,
 'principal': 20,
 'that': 21,
 'who': 22,
 'with': 23,
 "'s": 24,
 'assisted': 25,
 'but': 26,
 'fumble': 27,
 'gold': 28,
 'had': 29,
 'he': 30,
 'james': 31,
 'seven': 32,
 'they': 33,
 'were': 34,
 'while': 35,
 'an': 36,
 'between': 37,
 'coach': 38,
 'coin': 39,
 'dollar': 40,
 'egypt': 41,
 'end': 42,
 'gallipoli': 43,
 'great': 44,
 'guitar': 45,
 'henry': 46,
 'i.': 47,
 'italy': 48,
 'john': 49,
 'macedonia': 50,
 'named': 51,
 'off': 52,
 'palestine': 53,
 'player': 54,
 'position': 55,
 'recorded': 56,
 'robert': 57,
 'sir': 59,
 'smith': 60,
 'solo': 61,
 'thomas': 62,
 'total': 63,
 'up': 64,
 'would': 65,
 '  ': 66,
 '$': 67,
 '13th': 68,
 'abandoned': 69,
 'add': 70,
 'also': 71,
 'anglicus': 72,
 'annular': 73,
 'another': 74,
 'appeara

In [25]:
# In torchvision and PyTorch, the processing and batching of
# data is handled by DataLoaders. For some reason, torchtext
# has renamed the objects that do the exact same thing to
# Iterators. The basic functionality is the same

train_iter, test_iter = Iterator.splits(
        (train, test),
    
        # (91270, 10153) means 91270 for train and 10153 for test,
        # the number of examples in each
        # That is, we only want to create one "batch" for each
        # as we are only doing this process in TorchText to convert
        # our data into a PyTorch tensor object to be passed around
        # the matching networks program in the same way the
        # vision data was passed around in a numpy array
        # The matching networks program already takes care
        # of batching and we don;t want to distrub things too much
    
        batch_sizes=(10,10),sort_key=None, device=None, batch_size_fn=None, repeat=False, shuffle=None, sort=None, sort_within_batch=None)

# train_iter, test_iter = Iterator(dataset=train, batch_size = 10)

In [26]:
# Look at the batch

# batch = next(train_iter.__iter__()); batch

In [27]:
type(train_iter.__iter__())

generator

In [28]:
# Currently, the iterator returns a custom datatype
# called torchtext.data.Batch.
# we’ll convert the batch to a tuple in the form
# (x, y) where x is the label tensor
# and y is the sentence

class BatchWrapper:
    def __init__(self, dl, x_var, y_var):
        
        self.dl, self.x_var, self.y_var = dl, x_var, y_var # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            y = getattr(batch, self.y_var) # we assume only one input in this wrapper

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [29]:
train_dl = BatchWrapper(train_iter, "label", "sentence")
test_dl = BatchWrapper(test_iter, "label", "sentence")

In [30]:
X = next(train_dl.__iter__())
print(X)

(tensor([2, 2, 1, 2, 1, 1, 1, 2, 1, 1]), tensor([[ 17,   9, 129, 365,  19,  97, 242,  51, 345,   4],
        [237,   7, 351,  45, 354,   8,   2,   4,  13,  32],
        [ 52,   9,  65,  80,  29,   4,   6,  39,   2,  62],
        [126,   2,  27,  56, 153, 220,   3,  74,   6,  84],
        [152,   5,   2,  40,  11,   4,  21,  98,   3,  49],
        [304,   3,   6,   2,   9, 174,  71, 260,   8,   2],
        [ 21,  27,   3,   6,   4,  71,   8,   2,  10,   6],
        [ 79, 238,   8,   3,  32,  13, 289,   6, 140,   3],
        [  8, 274, 164,   4,  13, 332,  14,   3, 110, 318],
        [317, 258,   4,  83,   2,   8,  10,   4,  16,  89],
        [ 47,   7, 293, 208,   6, 170, 162,  83,  10,   1],
        [  4,   2, 286,  90,   3,   4, 165, 249,  18,   1],
        [ 17,   5,   8,  14,   7, 267,  94,  12, 185,   1],
        [  2,   3, 169,   4, 251,  12,   9, 106, 158,   1],
        [  6,   2,   4, 278,  12,   4,  35,  24,  11,   1],
        [  3,   5, 175,   7,   4, 196,  53,  11,   9,   1],

In [31]:
X[0]

tensor([2, 2, 1, 2, 1, 1, 1, 2, 1, 1])

In [32]:
Y_0 = X[0].numpy()

In [33]:
Y_1 = X[1].numpy()

In [34]:
type(Y_0)

numpy.ndarray

In [35]:
p = Y_0.argsort()
Y_0p = Y_0[p]

In [36]:
Y_1p = Y_1[:,p]
print(Y_1p)

[[129  19  97 242 345   4  17   9 365  51]
 [351 354   8   2  13  32 237   7  45   4]
 [ 65  29   4   6   2  62  52   9  80  39]
 [ 27 153 220   3   6  84 126   2  56  74]
 [  2  11   4  21   3  49 152   5  40  98]
 [  6   9 174  71   8   2 304   3   2 260]
 [  3   4  71   8  10   6  21  27   6   2]
 [  8  32  13 289 140   3  79 238   3   6]
 [164  13 332  14 110 318   8 274   4   3]
 [  4   2   8  10  16  89 317 258  83   4]
 [293   6 170 162  10   1  47   7 208  83]
 [286   3   4 165  18   1   4   2  90 249]
 [  8   7 267  94 185   1  17   5  14  12]
 [169 251  12   9 158   1   2   3   4 106]
 [  4  12   4  35  11   1   6   2 278  24]
 [175   4 196  53   9   1   3   5   7  11]
 [202 284   7   7   7   1  52   3  10   4]
 [ 12 282  44 371   4   1  14  22 187 195]
 [230  27  48   8 361   1 343  20  34 215]
 [308 349   2 273  12   1   1   9   1  18]
 [ 15   8   6  22   4   1   1   7   1   2]
 [  4   4   3 302 319   1   1   9   1   5]
 [352 270  77 329  13   1   1   2   1   3]
 [  1   7 2

In [37]:
Yp = (Y_0p, Y_1p)
print(Yp)

(array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([[129,  19,  97, 242, 345,   4,  17,   9, 365,  51],
       [351, 354,   8,   2,  13,  32, 237,   7,  45,   4],
       [ 65,  29,   4,   6,   2,  62,  52,   9,  80,  39],
       [ 27, 153, 220,   3,   6,  84, 126,   2,  56,  74],
       [  2,  11,   4,  21,   3,  49, 152,   5,  40,  98],
       [  6,   9, 174,  71,   8,   2, 304,   3,   2, 260],
       [  3,   4,  71,   8,  10,   6,  21,  27,   6,   2],
       [  8,  32,  13, 289, 140,   3,  79, 238,   3,   6],
       [164,  13, 332,  14, 110, 318,   8, 274,   4,   3],
       [  4,   2,   8,  10,  16,  89, 317, 258,  83,   4],
       [293,   6, 170, 162,  10,   1,  47,   7, 208,  83],
       [286,   3,   4, 165,  18,   1,   4,   2,  90, 249],
       [  8,   7, 267,  94, 185,   1,  17,   5,  14,  12],
       [169, 251,  12,   9, 158,   1,   2,   3,   4, 106],
       [  4,  12,   4,  35,  11,   1,   6,   2, 278,  24],
       [175,   4, 196,  53,   9,   1,   3,   5,   7,  11],
   

In [246]:
Y = (Y_0, Y_1)
print(Y)

(array([1, 1, 4, 4, 1, 4, 1, 4, 4, 1], dtype=int64), array([[275,  16, 255, 106, 260, 278, 183,  18,  10,  30],
       [  4, 268,  29,   2,  11,  71,   2, 178,   6, 190],
       [244,  24,  32,   5,   2,  56,   7,  38,  10,   8],
       [228, 112,  61,   3,   7, 122,   3,  91,   2,  70],
       [ 12,  12,   2,  11,   3,  61,  17, 111,   5,   9],
       [ 10,  10,   7,  51,   8,   2, 176, 225,   3, 242],
       [ 85,   4,   3,   4,   9,   7,   8,  17,  34, 188],
       [ 34, 102, 261, 232,  99,   3, 216,  55, 179,   8],
       [198,  11,  25,  18,  78,   4,  14,   8, 203, 135],
       [ 13,   2, 254,  54,  23, 233,   9, 237, 192,   9],
       [212,   7,   6,  13,   9, 152, 118,  80,   6, 214],
       [  6,   3,  56,   4,  32,  63, 120,   4,   2,  90],
       [  4,   6,   9, 272, 133,  14,  67,  18,   5,  26],
       [  2, 187,  18,  16, 115,   4,  10,   2,   3, 195],
       [  7,  13, 263, 182,  12, 207,  28,   7,   2,  11],
       [  3,   4,  16,  32,  10,   6,  39,   3,   5, 251],
   

In [181]:
type(next(train_dl.__iter__()))

tuple

In [182]:
type(next(train_dl.__iter__())[0])

torch.Tensor

In [183]:
print(next(train_dl.__iter__())[0][0:10])

tensor([1, 1, 2, 1, 1, 1, 1, 1, 1, 1])


In [184]:
type(next(train_dl.__iter__())[1])

torch.Tensor

In [185]:
print(next(train_dl.__iter__())[1])

tensor([[ 49,  98,  52, 101,  11,  22,   4,   8,  10, 111,  44],
        [ 58,   1,   2,  16,  39,   4,  30,  71,   6,  21,   8],
        [  4,   1,   7,  12,   4,  30,  68,  23,  10,  31,  27],
        [ 93,   1,   3,  18,  55,  27,  50,  46,   2,  24,  22],
        [ 96,   1, 108,   2,  54,  38,  61,  53,   7,  18,  75],
        [ 13,   1,  78,   5,  89,  81,   8,  94,   3,   2,  16],
        [ 10,   1,   4,   3,  56,   2,  43,  66, 110,   5,  29],
        [  2,   1,  97,  19,  79,   5,   2,  87,  72,   3,  17],
        [  5,   1,   8,  42,  11,   3,   7, 104,  85,   4,  13],
        [  3,   1,  17,  33,   2,   4,   3,  99,  80,  32,  24],
        [ 48,   1,  28,   6,   5,  32,  76,  41,   6,  64,   2],
        [107,   1,   4,  31,   3,  77,  36,   4,   2,  34,   5],
        [ 25,   1, 109,  14,   9,  28,  21,   8,   7,   9,   3],
        [ 84,   1,  20,   8,  26,  40,  47,   2,   3,   4,   6],
        [  6,   1,  74, 105,   1,  17,  19,   5,   2,  86,  35],
        [ 90,   1,  12,  

In [186]:
next(train_dl.__iter__())[0].shape

torch.Size([11])

In [187]:
next(train_dl.__iter__())[1].shape

torch.Size([33, 11])

In [47]:
# Transpose

Y_train = next(train_dl.__iter__())[0]
X_train = next(train_dl.__iter__())[1]
Y_test = next(test_dl.__iter__())[0]
X_test = next(test_dl.__iter__())[1]


X_train = X_train.t()
Y_train = Y_train.t()
X_test = X_test.t()
Y_test = Y_test.t()

RuntimeError: t() expects a 2D tensor, but self is 1D

In [5]:
# Reshape

# We convert the tensors to numpy arrays
# for the Matching Networks code so we  don't
# have to change everything that was for numpy arrays
# to PyTorch tensors. We could could then convert
# back to Tensors when we need them

# Cell incomplete, issue with size of train. Should be 90,000, not 9,000 long

X_train = X_train.reshape()
Y_train = Y_train.reshape()
X_test = X_test.reshape()
Y_test = Y_test.reshape()


In [None]:
# Save to numpy array

np.save('X_train.npy', X_train)
np.save('Y_train.npy', Y_train)
np.save('X_test.npy', X_test)
np.save('Y_test.npy', Y_test)

tensor([[0, 1, 2],
        [3, 4, 5]])


tensor([[0, 3],
        [1, 4],
        [2, 5]])