In [1]:
import re
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn

# Deep Learning for NLP - lab exercise 1

In this first lab exercise we will implement a simple bag-of-word
classifier, i.e. a classifier that ignores the sequential structure of
the sentence, and a classifier based on a convolutional neural network
(CNN). The goal is to predict if a sentence is a positive or negative
review of a movie. We will use a dataset constructed from IMDB.

1.  Load and clean the data
2.  Preprocess the data for the NN
3.  Module definition
4.  Train the network!

We will implement this model with Pytorch, the most popular deep
learning framework for Natural Language Processing. You can use the
following links for help:

-   turorials: <http://pytorch.org/tutorials/>
-   documentation: <http://pytorch.org/docs/master/>

## Data

The data can be download here: <http://caio-corro.fr/dl4nlp/imdb.zip>

There are two files: one with positive reviews (imdb.pos) and one with
negative reviews (imdb.neg). Each file contains 300000 reviews, one per
line.

The following functions can be used to load and clean the data.



In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    dataset=[]
    with open(filename) as f:
        line = f.readline()
        cpt=1
        skip=0
        while line :
            cleanline = clean_str(f.readline()).split()
            if cleanline: 
                dataset.append(cleanline)
            else: 
                line = f.readline()
                skip+=1
                continue
            if limit > 0 and cpt >= limit: 
                break
            line = f.readline()
            cpt+=1        

        print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset

The following cell load the first 5000 sentences in each review set.

In [4]:
LIM = 5000
txtfile = "./imdb/imdb.pos"
postxt = loadTexts(txtfile,limit=LIM)

txtfile = "./imdb/imdb.neg"
negtxt = loadTexts(txtfile,limit=LIM)

Load  5000  lines from  ./imdb/imdb.pos  /  1  lines discarded
Load  5000  lines from  ./imdb/imdb.neg  /  1  lines discarded


Split the data between train / dev / test, for example by creating lists
txt_train, label_train, txt_dev, ... You should take care to keep a
50/50 ratio between positive and negative instances in each set.

In [7]:
# split into train / dev / test
train_pos_indices = np.random.choice(len(postxt), size=int(0.6*LIM), replace=False)
# create dev excluding train
dev_pos_indices = np.random.choice(list(set(range(len(postxt))) - set(train_pos_indices)), size=int(0.2*LIM), replace=False)
# create test excluding train and dev
test_pos_indices = list(set(range(len(postxt))) - set(train_pos_indices) - set(dev_pos_indices))

train_neg_indices = np.random.choice(len(negtxt), size=int(0.6*LIM), replace=False)
# create dev excluding train
dev_neg_indices = np.random.choice(list(set(range(len(negtxt))) - set(train_neg_indices)), size=int(0.2*LIM), replace=False)
# create test excluding train and dev
test_neg_indices = list(set(range(len(negtxt))) - set(train_neg_indices) - set(dev_neg_indices))

train_pos = [postxt[i] for i in train_pos_indices]
dev_pos = [postxt[i] for i in dev_pos_indices]
test_pos = [postxt[i] for i in test_pos_indices]

train_neg = [negtxt[i] for i in train_neg_indices]
dev_neg = [negtxt[i] for i in dev_neg_indices]
test_neg = [negtxt[i] for i in test_neg_indices]

# create train / dev / test sets
train = [(x,1) for x in train_pos] + [(x,0) for x in train_neg]
dev = [(x,1) for x in dev_pos] + [(x,0) for x in dev_neg]
test = [(x,1) for x in test_pos] + [(x,0) for x in test_neg]

# Converting data to Pytorch tensors

We will first convert data to Pytorch tensors so they can be used in a
neural network. To do that, you must first create a dictionnary that
will map words to integers. Add to the dictionnary only words that are
in the training set (be sure to understand why we do that!).

Then, you can convert the data to tensors:

-   use tensors of longs: both the sentence and the label will be
    represented as integers, not floats!
-   these tensors do not require a gradient

A tensor representing a sentence is composed of the integer
representation of each word, e.g. \[10, 256, 3, 4\]. Note that some
words in the dev and test sets may not be in the dictionnary! (i.e.
unknown words) You can just skip them, even if this is a bad idea in
general.

In [124]:
# make a dictionary of all words in the training set
word_dict = {}
for sent, _ in train:
    for word in sent:
        if word not in word_dict:
            word_dict[word] = len(word_dict)

def sent2tensor(sent, word_dict):
    # convert sentence to list of indices, if a word is not in the dictionary, skip it
    idxs = [word_dict[word] if word in word_dict else -1 for word in sent]
    # remove words not in dictionary
    idxs = [idx for idx in idxs if idx >= 0]
    if idxs == []:
        return None
    return th.LongTensor(idxs)

train_data = [(sent2tensor(sent, word_dict), label) for sent, label in train]
dev_data = [(sent2tensor(sent, word_dict), label) for sent, label in dev]
test_data = [(sent2tensor(sent, word_dict), label) for sent, label in test]

# remove empty sentences
train_data = [x for x in train_data if x[0] is not None]
dev_data = [x for x in dev_data if x[0] is not None]
test_data = [x for x in test_data if x[0] is not None]

# Neural network definition

You need to implement two networks:

-   a simple bag of word model (note: it may be better to take the mean
    of input embeddings that the sum)
-   a simple CNN as described in the course

To simplify code, you can assume the input will always be a single
sentence first, and then implement batched inputs. In the case of
batched inputs, give to the forward function a (python) list of tensors.

The bag of word neural network should be defined as follows:

-   take as input a tensor that is a sequence of integers indexing word
    embeddings
-   retrieve the word embeddings from an embedding table
-   construct the "input" of the MLP by summing (or computing the mean)
    over all embeddingsÂ (i.e. bag-of-word model)
-   build a hidden represention using a MLP (1 layer? 2 layers?
    experiment! but maybe first try wihout any hidden layer...)
-   project the hidden representation to the output space: it is a
    binary classification task, so the output space is a scalar where a
    negative (resp. positive) value means the review is negative (resp.
    positive).

The CNN is a little bit more tricky to implement. The goal is that you
implement the one presented in the first lecture. Importantly, you
should add "padding" tokens before and after the sentence so you can
have a convolution even when there is a single word in the input. For
example, if you input sentence is \["word"\], you want to instead
consider the sentence \["\<BOS>", "word", "\<EOS>"\] if your window is
of size 2 or 3. You can do this either directly when you load the data,
or you can do that in the neural network module.

In [125]:
# BAG of word classifier
class CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, linear_dim):
        super(CBOW_classifier, self).__init__()
        # create embedding table
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # create linear layer
        if type(linear_dim) == int:
            self.linear == nn.Sequential(nn.Linear(embedding_dim, linear_dim), nn.ReLU(), nn.Linear(linear_dim, 1))
        elif type(linear_dim) in [list, tuple]:
            layers = [nn.Linear(embedding_dim, linear_dim[0]), nn.ReLU()]
            for i in range(len(linear_dim)-1):
                layers.append(nn.Linear(linear_dim[i], linear_dim[i+1]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(linear_dim[-1], 1))
            self.linear = nn.Sequential(*layers)
        else:
            raise ValueError("linear_dim must be an int, list or tuple")
        
        
    def forward(self, inputs):
        # get embeddings
        embeds = self.embedding(inputs)
        # sum embeddings and average
        embeds = th.sum(embeds, dim=0) / embeds.shape[0]
        # linear layer
        out = self.linear(embeds)
        # sigmoid
        out = F.sigmoid(out)
        return out
    
    def get_embeddings(self, inputs):
        # get embeddings
        embeds = self.embedding(inputs)
        # sum embeddings and average
        embeds = th.sum(embeds, dim=0) / embeds.shape[0]
        return embeds

## Loss function

Create a loss function builder.

-   Pytorch loss functions are documented here:
    <https://pytorch.org/docs/stable/nn.html#loss-functions>
-   In our case, we are interested in *BCELoss* and *BCEWithLogitsLoss*.
    Read their documentation and choose the one that fits with your
    network output

In [126]:
# define loss function
loss_fn = nn.BCELoss()

embedding_size = 100
linear_size = (50, 20)
# define model
model = CBOW_classifier(len(word_dict), embedding_size, linear_size)

# define optimizer
optim = th.optim.Adam(model.parameters(), lr=0.001)


## Training loop

Write your training loop!

-   parameterizable number of epochs
-   at each epoch, print the mean loss and the dev accuracy

In [127]:
# training loop
for epoch in range(10):
    # shuffle training data
    np.random.shuffle(train_data)
    # set model to train mode
    model.train()

    #compute train accuracy
    correct = 0

    # loop over training data
    for sent, label in train_data:
        # zero gradients
        optim.zero_grad()
        # forward pass
        out = model(sent)
        # compute loss
        loss = loss_fn(out, th.FloatTensor([label]))
        # backward pass
        loss.backward()
        # update parameters
        optim.step()

        #compute train accuracy
        pred = 1 if out > 0.5 else 0
        if pred == label:
            correct += 1
    # compute accuracy
    train_acc = correct / len(train_data)
    


    # set model to eval mode
    model.eval()
    # compute accuracy on dev set
    correct = 0
    for sent, label in dev_data:
        # forward pass
        out = model(sent)
        # get prediction
        pred = 1 if out > 0.5 else 0
        # check if prediction is correct
        if pred == label:
            correct += 1
    # compute accuracy
    acc = correct / len(dev_data)
    print("Epoch: {}, Train Acc: {}, Dev Acc: {}".format(epoch, train_acc, acc))

Epoch: 0, Train Acc: 0.6845, Dev Acc: 0.7469574036511156
Epoch: 1, Train Acc: 0.8095, Dev Acc: 0.7667342799188641
Epoch: 2, Train Acc: 0.868, Dev Acc: 0.7768762677484787
Epoch: 3, Train Acc: 0.9093333333333333, Dev Acc: 0.7824543610547667
Epoch: 4, Train Acc: 0.9363333333333334, Dev Acc: 0.7672413793103449
Epoch: 5, Train Acc: 0.96, Dev Acc: 0.7885395537525355
Epoch: 6, Train Acc: 0.971, Dev Acc: 0.7799188640973631
Epoch: 7, Train Acc: 0.9808333333333333, Dev Acc: 0.7799188640973631
Epoch: 8, Train Acc: 0.9898333333333333, Dev Acc: 0.7844827586206896
Epoch: 9, Train Acc: 0.9916666666666667, Dev Acc: 0.7789046653144016


In [128]:
#compute distance between two sentences in the embedding space of model
def compute_distance(sent1, sent2, model, word_dict):
    # get embeddings
    embeds1 = model.get_embeddings(sent2tensor(sent1, word_dict))
    embeds2 = model.get_embeddings(sent2tensor(sent2, word_dict))
    # compute distance
    return th.dist(embeds1, embeds2)

In [129]:
compute_distance(["good"], ["bad"], model, word_dict)

tensor(16.6578, grad_fn=<DistBackward0>)

In [130]:
compute_distance(["bad"], ["terrible"], model, word_dict)

tensor(12.7767, grad_fn=<DistBackward0>)

In [131]:
compute_distance(["good"], ["amazing"], model, word_dict)

tensor(13.7037, grad_fn=<DistBackward0>)

In [132]:
compute_distance(["this", "movie", "has", 'been', "amazing"], ["movie"], model, word_dict)

tensor(8.5425, grad_fn=<DistBackward0>)

In [133]:
sentence_1 = "this movie makes me hate".split()
sentence_2 = "this movie is literally the best in the world".split()
print(compute_distance(sentence_1, sentence_2, model, word_dict))

tensor(4.9489, grad_fn=<DistBackward0>)


It appears that the embeddings we have learnt are not clustered appropriately in the feature space; synonyms are at similar distances as antonyms. But that is ok, our goal has been to train a linear classifier using word embeddings - which we have reasonable achieved.

Next up: CNNs for the same task

In [134]:
# build a convolutional bag of words classifier
class ConvCBOX(nn.Module):
    def __init__(self, vocab_size, embedding_dim, kernel_size, out_channels, linear_dim):
        super(ConvCBOX, self).__init__()
        # create embedding table
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # create convolutional layer
        self.conv = nn.Conv1d(embedding_dim, out_channels, kernel_size, padding = 'same')
        # create linear layer
        if type(linear_dim) == int:
            self.linear == nn.Sequential(nn.Linear(out_channels, linear_dim), nn.ReLU(), nn.Linear(linear_dim, 1))
        elif type(linear_dim) in [list, tuple]:
            layers = [nn.Linear(out_channels, linear_dim[0]), nn.ReLU()]
            for i in range(len(linear_dim)-1):
                layers.append(nn.Linear(linear_dim[i], linear_dim[i+1]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(linear_dim[-1], 1))
            self.linear = nn.Sequential(*layers)
        else:
            raise ValueError("linear_dim must be an int, list or tuple")
        
        
    def forward(self, inputs):
        # get embeddings
        embeds = self.embedding(inputs)
        # transpose embeddings
        embeds = embeds.transpose(1,2)
        # convolve
        conv_out = self.conv(embeds)
        # max pool
        pool_out = F.max_pool1d(conv_out, conv_out.shape[2])
        # linear layer
        out = self.linear(pool_out.squeeze())
        # sigmoid
        out = F.sigmoid(out)
        return out
    
    def get_embeddings(self, inputs):
        # get embeddings
        embeds = self.embedding(inputs)
        # transpose embeddings
        embeds = embeds.transpose(1,2)
        # convolve
        conv_out = self.conv(embeds)
        # max pool
        pool_out = F.max_pool1d(conv_out, conv_out.shape[2])
        return pool_out.squeeze()

In [135]:
# define model
embedding_size = 100
kernel_size = 3
out_channels = 100
linear_size = (50, 20)
model = ConvCBOX(len(word_dict), embedding_size, kernel_size, out_channels, linear_size)

# define optimizer
optim = th.optim.Adam(model.parameters(), lr=0.001)


In [136]:
# training loop
for epoch in range(10):
    # shuffle training data
    np.random.shuffle(train_data)
    # set model to train mode
    model.train()

    #compute train accuracy
    correct = 0

    # loop over training data
    for sent, label in train_data:
        # zero gradients
        optim.zero_grad()
        # forward pass
        out = model(sent.unsqueeze(0))
        # compute loss
        loss = loss_fn(out, th.FloatTensor([label]))
        # backward pass
        loss.backward()
        # update parameters
        optim.step()

        #compute train accuracy
        pred = 1 if out > 0.5 else 0
        if pred == label:
            correct += 1
    # compute accuracy
    train_acc = correct / len(train_data)
    


    # set model to eval mode
    model.eval()
    # compute accuracy on dev set
    correct = 0
    for sent, label in dev_data:
        # forward pass
        out = model(sent.unsqueeze(0))
        # get prediction
        pred = 1 if out > 0.5 else 0
        # check if prediction is correct
        if pred == label:
            correct += 1
    # compute accuracy
    acc = correct / len(dev_data)
    print("Epoch: {}, Train Acc: {}, Dev Acc: {}".format(epoch, train_acc, acc))

Epoch: 0, Train Acc: 0.6773333333333333, Dev Acc: 0.7565922920892495
Epoch: 1, Train Acc: 0.8165, Dev Acc: 0.7697768762677485
Epoch: 2, Train Acc: 0.8985, Dev Acc: 0.7723123732251521
Epoch: 3, Train Acc: 0.9455, Dev Acc: 0.7743407707910751
Epoch: 4, Train Acc: 0.9673333333333334, Dev Acc: 0.7723123732251521
Epoch: 5, Train Acc: 0.978, Dev Acc: 0.7824543610547667
Epoch: 6, Train Acc: 0.9881666666666666, Dev Acc: 0.7794117647058824
Epoch: 7, Train Acc: 0.9888333333333333, Dev Acc: 0.789553752535497
Epoch: 8, Train Acc: 0.991, Dev Acc: 0.7692697768762677
Epoch: 9, Train Acc: 0.9923333333333333, Dev Acc: 0.787525354969574


In [118]:
conv = nn.Conv1d(100, 3, 3, padding = 'same')

In [119]:
conv(th.randn(1,100,10)).shape

torch.Size([1, 3, 10])