Before we start doing anything, I think it's important to understand for NLP, this is the intuitive process on what we are trying to do when we are processing our data in the IMDB dataset:
1. Tokenization: break sentence into individual words
    - Before: `"PyTorch seems really easy to use!"`
    - After: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
2. Building vocabulary: build an index of words associated with unique numbers
    - Before: `["PyTorch", "seems", "really", "easy", "to", "use", "!"]`
    - After: `{"Pytorch: 0, "seems": 1, "really": 2, ...}`
3. Convert to numerals: map words to unique numbers (indices)
    - Before: `{"Pytorch: 0, "seems": 1, "really": 2, ...}`
    - After: `[0, 1, 2, ...]`
4. Embedding look-up: map sentences (indices now) to fixed matrices
    - ```[[0.1, 0.4, 0.3],
       [0.8, 0.1, 0.5],
       ...]```

In [1]:
# Critical imports
import matplotlib.pyplot as plt
import torch
%matplotlib inline

In [2]:
# Set seed
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(1337)

In [3]:
# Set plotting style
plt.style.use(('dark_background', 'bmh'))
plt.rc('axes', facecolor='none')
plt.rc('figure', figsize=(16, 4))

In [4]:
from torchtext import data
import torch

# Create instances of fields
# The important field here is fix_length: all examples using this field will be padded to, or None for flexible sequence lengths
# We are fixing this because we will be using a FNN not an LSTM/RNN/GRU where we can go through uneven sequence lengths
text = data.Field(sequential=True, fix_length=80, batch_first=True, lower=True)
label = data.LabelField(sequential=False)

In [5]:
# Calling splits() class method of datasets.IMDB to return a torchtext.data.Dataset object
from torchtext import datasets
ds_train, ds_test = datasets.IMDB.splits(text, label)

In [6]:
# Training and test set each 25k samples
# 2 fields due to the way we split above
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

train :  25000
test :  25000
train.fields : {'text': <torchtext.data.field.Field object at 0x7f2e76657748>, 'label': <torchtext.data.field.LabelField object at 0x7f2e766577b8>}


In [7]:
# Get validation set
import random
seed_num = 1337
ds_train, ds_valid = ds_train.split(random_state=random.seed(seed_num))

In [8]:
# Now we've training, validation and test set
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('valid : ', len(ds_test))

train :  17500
valid :  7500
valid :  25000


In [9]:
# Build vocabulary
text.build_vocab(ds_train, max_size=25000)
label.build_vocab(ds_train)

In [10]:
# Print vocab size
print('Vocabulary size: {}'.format(len(text.vocab)))
print('Label size: {}'.format(len(label.vocab)))

Vocabulary size: 25002
Label size: 2


In [11]:
# Print most common vocabulary text
most_common_samples = 10
print(text.vocab.freqs.most_common(most_common_samples))

[('the', 225938), ('a', 112571), ('and', 111513), ('of', 101389), ('to', 94175), ('is', 72550), ('in', 63886), ('i', 49428), ('this', 49096), ('that', 46809)]


In [12]:
# Print most common labels
print(label.vocab.freqs.most_common())

[('neg', 8835), ('pos', 8665)]


In [13]:
# Sample 0 label
ds_train[0].label

'neg'

In [14]:
# Sample 0 text: broken down into individual portions
ds_train[0].text

['first',
 'of',
 'all',
 'i',
 'just',
 'want',
 'to',
 'say',
 'that',
 'i',
 'love',
 'this',
 'show!!!',
 'but',
 'this',
 'episode...this',
 'episode',
 'makes',
 'a',
 'mockery',
 'of',
 'the',
 'entire',
 'show.<br',
 '/><br',
 '/>i',
 "don't",
 'know',
 'what',
 'they',
 'tried',
 'to',
 'achieve',
 'with',
 'this',
 'episode',
 'but',
 'they',
 'successfully',
 'created',
 'the',
 'worst',
 'episode',
 'in',
 'the',
 'entire',
 'series.<br',
 '/><br',
 '/>there',
 'is',
 'no',
 'story',
 'line,',
 'everything',
 'is',
 'chaotic',
 'and',
 'the',
 'jokes.....are',
 'crap.<br',
 '/><br',
 '/>the',
 'way',
 'they',
 'tried',
 'to',
 'answer',
 'some',
 'of',
 'the',
 'remaining',
 'questions',
 'in',
 'the',
 'game.....',
 'for',
 'example',
 '"how',
 'do',
 'the',
 'furlings',
 'look',
 'like"',
 'by',
 'creating',
 'that',
 'stupid',
 '"previously',
 'on..."......is',
 'simply',
 'embarrassing.<br',
 '/><br',
 '/>its',
 'clear',
 'that',
 'the',
 'writers',
 'are',
 'running',


In [15]:
# Sample 0 text: human readeable sample
def show_text(sample):
    print(' '.join(word for word in sample))
    
show_text(ds_train[0].text)

first of all i just want to say that i love this show!!! but this episode...this episode makes a mockery of the entire show.<br /><br />i don't know what they tried to achieve with this episode but they successfully created the worst episode in the entire series.<br /><br />there is no story line, everything is chaotic and the jokes.....are crap.<br /><br />the way they tried to answer some of the remaining questions in the game..... for example "how do the furlings look like" by creating that stupid "previously on..."......is simply embarrassing.<br /><br />its clear that the writers are running out of ideas and that is really too bad.


In [16]:
# Create and iterable object for our training, validation and testing datasets
# Batches examples of similar lengths together that minimizes amount of padding needed
batch_size = 1  # Change batch size from 1 to bigger number once explanation is done
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False
)

In [17]:
# Check if iterator above is an iterable which should show True
import collections
isinstance(train_loader, collections.Iterable)

True

In [18]:
# What's inside this iteratable object? Our text and label although now everything is in machine format (not "words") but in numbers!
# The text we saw above becomes a matrix of size 1 x 80 represented by the fixed length we defined before that
list(train_loader)[0]


[torchtext.data.batch.Batch of size 1]
	[.text]:[torch.LongTensor of size 1x80]
	[.label]:[torch.LongTensor of size 1]

In [19]:
# Alternative to above, this is much faster but the above code is easy to understand and implement
next(train_loader.__iter__())


[torchtext.data.batch.Batch of size 1]
	[.text]:[torch.LongTensor of size 1x80]
	[.label]:[torch.LongTensor of size 1]

In [20]:
test_batch = next(train_loader.__iter__())

In [21]:
# What methods can we call on this batch object? Text and label
test_batch.fields

dict_keys(['text', 'label'])

In [22]:
# Let's break this down to check what's in a batch
test_batch.text

tensor([[     9,    383,      6,      2,     20,     15,      3,   5363,
           5432,      8,      0,     38,    147,     26,     32,    316,
             48,      9,    222,    160,      6,   1827,      2,     76,
              7,     50,  15087,      2,     20,      7,     50,    228,
            605,      9,    252,     12,     92,     26,    652,      6,
           4729,    190,   4233,     91,    901,      6,      2,    473,
             84,      2,    570,      5,     20,     25,     65,     67,
              8,      3,    514,    390,     63,     25,    169,      6,
             74,   1214,     51,    817,  17403,      0,     14,     50,
             56,    412,   1974,  15623,      7,   4195,      2,     20]])

In [23]:
# 1 comment per batch, each comment is limited to a size of 80 as we've defined
test_batch.text.size()

torch.Size([1, 80])

In [24]:
test_batch.label

tensor([ 0])

In [25]:
# Extremely weird problem in torchtext where BucketIterator returns a Batch object versus just a simple tuple of tensors containing our text index and labels
# So let's fix this with a new class FixBatchGenerator

class FixBatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)
            
train_loader, valid_loader, test_loader = FixBatchGenerator(train_loader, 'text', 'label'), FixBatchGenerator(valid_loader, 'text', 'label'), FixBatchGenerator(test_loader, 'text', 'label')

In [26]:
# Text index
print(next(train_loader.__iter__())[0])

# Text label
print(next(train_loader.__iter__())[1])

tensor([[    70,      3,      0,    351,    852,     72,      2,    421,
              5,      2,   5589,    440,   1111,   2790,     44,     23,
            228,    179,      2,    151,    374,    797,      6,     26,
              2,    550,     11,    274,    140,    169,     42,      8,
              2,   2227,     53,   1004,      0,     35,    193,      2,
           2227,     18,     49,      2,    834,      0,     27,    177,
            154,   2335,      7,    274,    243,      6,   1557,     42,
            150,      6,      2,   7791,     11,     53,     14,   1102,
              0,  13586,   3252,      7,      2,   1243,      5,      2,
            440,    374,    243,      6,     74,      2,    151,    374]])
tensor([ 1])


In [27]:
import torch.nn as nn
   
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # Linear function
        self.fc1 = nn.Linear(embedding_dim, hidden_dim) 

        # Non-linearity
        self.sigmoid = nn.Sigmoid()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)  

    def forward(self, x):
        # Embedding
        embedded = self.embedding(x)
        
        # Linear function  # LINEAR
        out = self.fc1(embedded)

        # Non-linearity  # NON-LINEAR
        out = self.sigmoid(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)
        return out

In [28]:
input_dim = 80
embedding_dim = 16 
hidden_dim = 16
output_dim = 2

# Instantiate model class and assign to object
model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)

# Push model to CUDA device if available
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

In [29]:
# Number of groups of parameters
print(len(list(model.parameters())))

5


In [30]:
# Print parameters
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())

torch.Size([80, 16])
torch.Size([16, 16])
torch.Size([16])
torch.Size([2, 16])
torch.Size([2])


In [32]:
iter = 0
num_epochs = 10
for epoch in range(num_epochs):
    for i, (samples, labels) in enumerate(train_loader):
        # Load images with gradient accumulation capabilities
        samples = samples.view(-1, input_dim).requires_grad_()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(samples)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for samples, labels in test_loader:
                # Load images with gradient accumulation capabilities
                samples = samples.view(-1, input_dim).requires_grad_()

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

RuntimeError: index out of range at /opt/conda/conda-bld/pytorch_1524580978845/work/aten/src/TH/generic/THTensorMath.c:343