In [1]:
import re
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Keeps letters, digits and the characters ``( ) , ! ? ' ` ``; every other
    character becomes a space. English clitics ('s, 've, n't, 're, 'd, 'll)
    and the punctuation marks ``, ! ( ) ?`` are separated from neighbouring
    text by spaces so that a later ``str.split()`` yields them as tokens.

    Unlike the original, punctuation tokens do not carry a stray backslash:
    a replacement string such as " \\( " is not a regex pattern, so the
    backslash used to be copied verbatim into the output.

    Args:
        string: raw text to clean.
        tolower: when True (default) the result is lower-cased.

    Returns:
        The cleaned string, with runs of whitespace collapsed to one space
        and leading/trailing whitespace removed.
    """
    # Anything outside the kept alphabet becomes a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach clitics, in the same order as the original implementation.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    # Surround punctuation marks with spaces (no backslash in the output).
    string = re.sub(r"([,!()?])", r" \1 ", string)
    # Collapse the runs of whitespace produced by the steps above.
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    """Read a text file and return its lines as lists of tokens.

    Each line is passed through ``clean_str`` and split on whitespace.
    Lines that contain no token after cleaning are counted as discarded
    and left out of the result.

    The previous implementation called ``f.readline()`` twice per iteration
    (once for the loop condition, once for the text actually tokenized), so
    every other line of the file was silently dropped.

    Args:
        filename: path of the file to read, one sentence per line.
        limit: if > 0, stop after this many sentences have been kept.

    Returns:
        A list of token lists, one per kept line, in file order.
    """
    dataset = []
    skip = 0
    with open(filename) as f:
        for line in f:
            tokens = clean_str(line).split()
            if not tokens:
                skip += 1
                continue
            dataset.append(tokens)
            # Stop as soon as enough sentences have been collected.
            if limit > 0 and len(dataset) >= limit:
                break

    print("Load ", len(dataset), " lines from ", filename , " / ", skip ," lines discarded")
    return dataset

In [3]:
# Maximum number of sentences read from each class file.
LIM = 5000

# Positive reviews.
postxt = loadTexts("./imdb/imdb.pos", limit=LIM)

# Negative reviews; txtfile keeps the path of the last file read.
txtfile = "./imdb/imdb.neg"
negtxt = loadTexts(txtfile, limit=LIM)

Load  5000  lines from  ./imdb/imdb.pos  /  1  lines discarded
Load  5000  lines from  ./imdb/imdb.neg  /  1  lines discarded


In [4]:
# split into train / dev / test (60% / 20% / remainder), separately per class
SPLIT_SEED = 0  # fixed seed so that re-running the notebook gives the same split
split_rng = np.random.RandomState(SPLIT_SEED)


def split_indices(n_items, rng, train_frac=0.6, dev_frac=0.2):
    """Randomly partition range(n_items) into train / dev / test index sets.

    The sizes are derived from n_items itself, so the split stays valid when
    fewer sentences were loaded than the requested limit.

    Args:
        n_items: number of examples to split.
        rng: a numpy RandomState used to shuffle the indices.
        train_frac: fraction of the examples assigned to train.
        dev_frac: fraction of the examples assigned to dev.

    Returns:
        (train_indices, dev_indices, test_indices); the three are disjoint
        and together cover range(n_items). The test part is a plain list
        holding whatever is left after train and dev.
    """
    order = rng.permutation(n_items)
    n_train = int(train_frac * n_items)
    n_dev = int(dev_frac * n_items)
    return (order[:n_train],
            order[n_train:n_train + n_dev],
            order[n_train + n_dev:].tolist())


train_pos_indices, dev_pos_indices, test_pos_indices = split_indices(len(postxt), split_rng)
train_neg_indices, dev_neg_indices, test_neg_indices = split_indices(len(negtxt), split_rng)

train_pos = [postxt[i] for i in train_pos_indices]
dev_pos = [postxt[i] for i in dev_pos_indices]
test_pos = [postxt[i] for i in test_pos_indices]

train_neg = [negtxt[i] for i in train_neg_indices]
dev_neg = [negtxt[i] for i in dev_neg_indices]
test_neg = [negtxt[i] for i in test_neg_indices]

# create train / dev / test sets (label 1 = positive, 0 = negative)
train = [(x,1) for x in train_pos] + [(x,0) for x in train_neg]
dev = [(x,1) for x in dev_pos] + [(x,0) for x in dev_neg]
test = [(x,1) for x in test_pos] + [(x,0) for x in test_neg]

In [5]:
# Vocabulary of the training set: each new word gets the next free integer id,
# in order of first appearance.
word_dict = {}
for tokens, _label in train:
    for token in tokens:
        # setdefault only inserts when the token is new; len() is the next id.
        word_dict.setdefault(token, len(word_dict))

def sent2tensor(sent, word_dict):
    """Convert a tokenized sentence into a LongTensor of vocabulary indices.

    Words missing from word_dict are dropped. Returns None when no word of
    the sentence is left after that filtering.
    """
    # Unknown words map to the sentinel -1, which the filter then removes.
    looked_up = (word_dict.get(token, -1) for token in sent)
    kept = [index for index in looked_up if index >= 0]
    if not kept:
        return None
    return th.LongTensor(kept)

# Index-encode every split with the training vocabulary, dropping the
# sentences for which sent2tensor returns None (no known word left).
train_data = [
    (tensor, label)
    for tensor, label in ((sent2tensor(sent, word_dict), label) for sent, label in train)
    if tensor is not None
]
dev_data = [
    (tensor, label)
    for tensor, label in ((sent2tensor(sent, word_dict), label) for sent, label in dev)
    if tensor is not None
]
test_data = [
    (tensor, label)
    for tensor, label in ((sent2tensor(sent, word_dict), label) for sent, label in test)
    if tensor is not None
]

First: Let's do the linear classifier!

In [6]:
import gensim
from nltk.data import find

# Resolve the pruned word2vec sample through NLTK's data lookup, then load it
# with gensim as text-format (non-binary) keyed vectors.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_sample,
    binary=False,
)

This time, we build the dataset directly in the embedding space: each sentence becomes the sequence of pre-trained word2vec vectors of its words.

In [7]:
# create embedding for all datapoints
train_data_test = [[[embedding_model[word] for word in sent if word in embedding_model], label] for sent, label in train]
dev_data_test = [[[embedding_model[word] for word in sent if word in embedding_model], label] for sent, label in dev]
test_data_test = [[[embedding_model[word] for word in sent if word in embedding_model], label] for sent, label in test]

# remove empty sentences
train_data_test = [x for x in train_data_test if x[0] != []]
dev_data_test = [x for x in dev_data_test if x[0] != []]
test_data_test = [x for x in test_data_test if x[0] != []]

# convert to tensors
train_data_test = [(th.FloatTensor([x[0]]), x[1]) for x in train_data_test]
dev_data_test = [(th.FloatTensor([x[0]]), x[1]) for x in dev_data_test]
test_data_test = [(th.FloatTensor([x[0]]), x[1]) for x in test_data_test]

  train_data_test = [(th.FloatTensor([x[0]]), x[1]) for x in train_data_test]


In [12]:
class LSTMOW(nn.Module):
    """LSTM classifier that scores a sequence from its last time step only.

    The input is fed to a batch-first LSTM, so x is expected as
    (batch, sequence length, input_dim). The LSTM output at the final time
    step goes through a linear layer, giving (batch, output_dim) raw scores
    (no softmax or sigmoid is applied here).

    The recurrent state is kept in `self.hidden` between calls. It is stored
    detached from the autograd graph, so calling the model again without
    resetting it does not back-propagate through the previous sequence, and
    it is re-created automatically when the batch size or device of the
    input differs from the stored state.

    Args:
        input_dim: size of each input vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output scores.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=1):
        super(LSTMOW, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=1):
        """Return a zero (h_0, c_0) pair of shape (n_layers, batch_size, hidden_dim).

        The tensors take the dtype and device of the model parameters.
        """
        reference = self.linear.weight
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

    def forward(self, x):
        """Run the LSTM over x and return the scores of the last time step."""
        h_0 = self.hidden[0]
        # A stored state built for another batch size or device cannot be reused.
        if h_0.size(1) != x.size(0) or h_0.device != x.device:
            self.hidden = self.init_hidden(x.size(0))
        lstm_out, hidden = self.lstm(x, self.hidden)
        # Keep the values but cut the graph, so a later backward() does not
        # try to traverse a graph whose buffers were already freed.
        self.hidden = tuple(state.detach() for state in hidden)
        y_pred = self.linear(lstm_out[:, -1, :])
        return y_pred

In [13]:
model = LSTMOW(300, 100, 2, n_layers=1)
loss_function = nn.BCELoss()

optimizer = th.optim.SGD(model.parameters(), lr=0.1)
epochs = 10

In [15]:
for epoch in range(epochs):
    for sentence, label in train_data_test:
        model.zero_grad()
        model.hidden = model.init_hidden()
        y_pred = model(sentence)
        loss = loss_function(y_pred, ag.Variable(th.FloatTensor([label])))
        loss.backward()
        optimizer.step()

ValueError: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([1, 2])) is deprecated. Please ensure they have the same size.