In [None]:
# Important, 

# The aim of this code is to convert the CSV file of
# labels = missing words, and sentences with missing words
# into a tensor of numbers that can be passed through
# the matching networks code just like the numpy array
# used for the images in the original code based on the
# Omniglot dataset

# The numbers in the tensor will be the numbers each word
# refers to in the vocabulary we are building
# We will not embed at this stage because this
# is done inside the matching network. We are, in effect,
# not completing the TorchText preprocessing

# This code is mainly a mixture of two tutorials:
# http://anie.me/On-Torchtext/
# https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

# Comments are a mixture of those from the tutorials (most of them)
# and my own

# Note to self, use Conda environment PyTorch1

In [None]:
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext import data
from torchtext.data import Iterator
import spacy

In [None]:
# Check what the data looks like: preview the first 20 rows
# of the training CSV before any TorchText processing

pd.read_csv("data/train2.csv").head(20)

In [None]:
# Use spaCy to tokenize, or split up, the labels and
# sentences into individual words. Note the labels are
# already individual words.
# NOTE(review): the 'en' shortcut name presumably relies on a spaCy 2.x
# shortcut link; newer spaCy releases expect the full package name
# (e.g. 'en_core_web_sm') -- confirm for your environment

spacy_en = spacy.load('en')

def tokenizer(text):
    """Split ``text`` into a list of token strings using the spaCy English tokenizer."""
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

# We first define a Field, a class that contains
# information on how you want the data preprocessed. It acts
# like an instruction manual that data.TabularDataset will use.
# We define two fields, one for the sentences and one for the
# labels

# Sentences: sequential text, split into tokens by our spaCy tokenizer
TEXT = data.Field(sequential=True, tokenize=tokenizer)
# Labels: one non-sequential value per example, marked as the target
LABEL = data.Field(sequential=False, is_target=True)

In [None]:
# The fields know what to do when given raw data.
# Now, we need to tell the fields what data it
# should work on. This is where we use Datasets.

# The splits method creates a dataset for the train
# and test data by applying the same processing.
# Each (name, Field) pair is matched to a CSV column in order:
# first column -> 'label' (LABEL), second column -> 'sentence' (TEXT).
# NOTE(review): skip_header is not passed; if the CSV files start with
# a header row it would presumably be loaded as an example -- confirm

train, test = data.TabularDataset.splits(
        path='data/', train='train2.csv', test='test2.csv', format='csv',
        fields=[('label', LABEL), ('sentence', TEXT)])

In [None]:
# Sanity check: show the class of the object returned for the train split
type(train)

In [None]:
# Torchtext handles mapping words to integers, but
# it has to be told the full range of words it should
# handle.

# We are currently building the vocab from the train
# and test data

# One vocabulary per field, each built over both the train and test datasets
TEXT.build_vocab(train, test)   # vocabulary for the sentence tokens
LABEL.build_vocab(train, test)  # vocabulary for the label words

# This makes torchtext go through all the elements in the
# train and test sets, check the contents corresponding to each
# field, and register the words in its vocabulary. Torchtext
# has its own class called Vocab for handling the vocabulary.
# The Vocab class holds a mapping from word to id in its stoi
# attribute and a reverse mapping in its itos attribute.

In [None]:
# vocab = LABEL.vocab
# print(vocab.stoi)

In [None]:
# vocab = TEXT.vocab
# print(vocab.stoi)

In [None]:
# In torchvision and PyTorch, the processing and batching of
# data is handled by DataLoaders. For some reason, torchtext
# has renamed the objects that do the exact same thing to
# Iterators. The basic functionality is the same

train_iter, test_iter = Iterator.splits(
        (train, test),

        # One batch size per dataset: 90000 for train and 10000 for test.
        # That is, we only want to create one "batch" for each,
        # as we are only doing this process in TorchText to convert
        # our data into a PyTorch tensor object to be passed around
        # the matching networks program in the same way the
        # vision data was passed around in a numpy array.
        # The matching networks program already takes care
        # of batching and we don't want to disturb things too much.
        # NOTE(review): this comment previously quoted (91270, 10153) as
        # the number of examples in each split; if those counts are right,
        # the sizes below leave some examples out of the first batch -- confirm

        batch_sizes=(90000,10000))

# train_iter, test_iter = Iterator(dataset=train, batch_size = 10)

In [None]:
# Look at the batch

# batch = next(train_iter.__iter__()); batch

In [None]:
# Sanity check: show the type of object produced by calling __iter__ on the train iterator
type(train_iter.__iter__())

In [None]:
# Currently, the iterator returns a custom datatype
# called torchtext.data.Batch.
# we’ll convert the batch to a tuple in the form
# (x, y) where x is the label tensor
# and y is the sentence

class BatchWrapper:
    """Re-expose an iterable of batch objects as a stream of ``(x, y)`` tuples.

    ``dl`` is the wrapped iterable of batches; ``x_var`` and ``y_var`` are the
    names of the two attributes read from every batch it produces.
    """

    def __init__(self, dl, x_var, y_var):
        # Keep the wrapped iterable and the two attribute names to extract
        self.dl = dl
        self.x_var = x_var
        self.y_var = y_var

    def __len__(self):
        # Report the same length as the wrapped iterable
        return len(self.dl)

    def __iter__(self):
        for current in self.dl:
            # Exactly one attribute is read for each side of the pair
            yield getattr(current, self.x_var), getattr(current, self.y_var)

In [None]:
# Wrap the train iterator so each batch comes out as a (label, sentence) tuple
train_dl = BatchWrapper(train_iter, "label", "sentence")
#test_dl = BatchWrapper(test_iter, "label", "sentence")

In [None]:
# Take the first (label, sentence) tuple produced by the wrapped train iterator
X_train = next(iter(train_dl))
#X_test = next(test_dl.__iter__())

In [None]:
# Unpack the (label, sentence) tuple into numpy arrays
X_train_labels, X_train_sentences = X_train[0].numpy(), X_train[1].numpy()

# Order the examples by label id; the same permutation is applied to the
# second axis of the sentence array so its columns stay aligned with the labels
p = np.argsort(X_train_labels)
X_train_labels, X_train_sentences = X_train_labels[p], X_train_sentences[:, p]

print(X_train_sentences.shape)
print(X_train_labels.shape)

# X_test_labels = X_test[0].numpy()
# X_test_sentences = X_test[1].numpy()
# q = X_test_labels.argsort()
# X_test_labels = X_test_labels[q]
# X_test_sentences = X_test_sentences[:,q]
# print(X_test_sentences.shape)
# print(X_test_labels.shape)

In [None]:
# Reverse the axis order so the axis that was indexed by example comes first
X_train_sentences = np.transpose(X_train_sentences)
print(X_train_sentences.shape)

In [None]:
# Split the leading axis into (10, 9000) and keep the trailing token axis.
# The token-axis length is read from the array instead of being hard-coded
# as 219: TEXT is defined without a fixed length, so the padded length
# presumably follows the longest sentence in whichever batch was drawn.
# The reshape still raises if the leading axis is not 10 * 9000 long.
# NOTE(review): this assumes the label-sorted rows form 10 contiguous
# groups of 9000 -- TODO confirm against the label distribution
X_train_sentences = np.reshape(
    X_train_sentences, (10, 9000, X_train_sentences.shape[-1]))
print(X_train_sentences.shape)

In [None]:
# Swap the first two axes and leave the last (token) axis in place
X_train_sentences = np.transpose(X_train_sentences, (1, 0, 2))
print(X_train_sentences.shape)

In [None]:
# Fold the flat, sorted label vector into a 10 x 9000 grid
X_train_labels = X_train_labels.reshape((10, 9000))
print(X_train_labels.shape)

In [None]:
# Reverse the axis order of the label grid, mirroring the swap of the
# first two axes applied to the sentence array
X_train_labels = np.transpose(X_train_labels)
print(X_train_labels.shape)

In [None]:
# Write the reshaped train arrays to .npy files, using relative file names
np.save('X_train_sentences.npy', X_train_sentences)
np.save('X_train_labels.npy', X_train_labels)

# np.save('X_test_sentences.npy', X_test_sentences)
# np.save('X_test_labels.npy', X_test_labels)

In [None]:
# Some experiments

In [None]:
# Reshape

# We convert the tensors to numpy arrays
# for the Matching Networks code so we don't
# have to change everything that was for numpy arrays
# to PyTorch tensors. We could then convert
# back to Tensors when we need them

# TODO: cell incomplete -- issue with the size of train
# (should be 90,000, not 9,000 long).
# The reshape calls are left disabled so that Run All does not stop here:
# X_train is the (label, sentence) tuple taken from the first batch, which
# has no .reshape(), and Y_train / X_test / Y_test are not defined in the
# cells above.

# X_train = X_train.reshape()
# Y_train = Y_train.reshape()
# X_test = X_test.reshape()
# Y_test = Y_test.reshape()

In [None]:
# Toy 8 x 6 grid holding the integers 1..48 in row-major order,
# used to see how reshape rearranges elements
a = np.arange(1, 49).reshape(8, 6)
print(a)

In [None]:
# Fold the toy grid into 2 blocks of 4 rows x 6 columns
a = a.reshape((2, 4, 6))
print(a)