In [2]:
import nltk
import numpy as np
import torch
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import pad

# Mini-batch size; passed as batch_size to both the train and test DataLoader.
BATCH_SIZE = 64

In [3]:
# read GloVe
# citation: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadGloveModel(File):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by the
    components of its vector. Blank or whitespace-only lines (for example a
    trailing empty line) are skipped instead of raising IndexError.

    Args:
        File: path of the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D NumPy array of floats. If a
        token occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    gloveModel = {}
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            splitLines = line.split()
            if not splitLines:
                # Nothing on this line; indexing [0] below would fail.
                continue
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [4]:
# read movie reviews data
# tokenize -> lowercase -> drop out-of-vocabulary tokens and stopwords -> lemmatize
def get_movie_reviews_data(path, data_type = "train", vocab = None):
    """Read a tab-separated movie-review file and return tokenized phrases.

    The first line of the file is treated as a header and skipped, as are
    blank lines. The phrase text is taken from the third tab-separated column;
    for ``"train"`` the label is the integer in the fourth column.

    Each phrase is split into runs of word characters and lowercased. Tokens
    that are missing from the embedding vocabulary or that are English
    stopwords are dropped; the remaining tokens are lemmatized. When the lemma
    itself is not in the vocabulary, the lowercased token (which is known to
    be in it) is kept instead, so every returned token can be looked up in the
    vocabulary without a KeyError.

    Phrases that end up with no tokens are removed (together with their label
    for ``"train"``).
    NOTE(review): for ``"test"`` this means the returned list no longer lines
    up one-to-one with the rows of the file -- confirm that no caller needs
    that alignment.

    Args:
        path: path of the TSV file.
        data_type: ``"train"`` (phrases and labels) or ``"test"`` (phrases only).
        vocab: container of known tokens (anything supporting ``in``). When
            omitted, the module-level ``word2vec_dict`` is used, which must
            then already be defined.

    Returns:
        ``(tokenized_sentences, labels)`` for ``"train"`` and
        ``(tokenized_sentences, None)`` for ``"test"``.

    Raises:
        ValueError: if ``data_type`` is neither ``"train"`` nor ``"test"``.
    """
    if data_type not in ("train", "test"):
        raise ValueError("data_type must be 'train' or 'test', got %r" % (data_type,))
    if vocab is None:
        # Backward-compatible default: the notebook-level GloVe dictionary.
        vocab = word2vec_dict

    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    # A set makes the per-token stopword test constant-time.
    english_stopwords = set(stopwords.words('english'))

    def normalize(sentence):
        # Turn one raw phrase into its list of in-vocabulary tokens.
        kept = []
        for token in tokenizer.tokenize(sentence):
            lowered = token.lower()
            if lowered not in vocab or lowered in english_stopwords:
                continue
            lemma = lemmatizer.lemmatize(lowered)
            # Prefer the lemma, but never emit a token the vocabulary lacks.
            kept.append(lemma if lemma in vocab else lowered)
        return kept

    with open(path) as f:
        lines = f.readlines()[1:]
    rows = [line.split('\t') for line in lines if line.strip()]
    tokenized_sentences = [normalize(row[2]) for row in rows]

    if data_type == "test":
        return [x for x in tokenized_sentences if x], None

    labels = [int(row[3]) for row in rows]
    zipped = [(x, y) for x, y in zip(tokenized_sentences, labels) if x]
    return [x for x, _ in zipped], [y for _, y in zipped]

In [5]:
# get embeddings
def get_embeddings(tokenized_sentences, word2vec_dict):
    """Replace every token with its vector, one NumPy array per sentence.

    Args:
        tokenized_sentences: iterable of token lists.
        word2vec_dict: mapping from token to its embedding vector.

    Returns:
        list with one ``np.array`` per sentence, built from the vectors of
        that sentence's tokens in order.

    Raises:
        KeyError: if a token is not a key of ``word2vec_dict``.
    """
    embedded = []
    for sentence in tokenized_sentences:
        vectors = [word2vec_dict[token] for token in sentence]
        embedded.append(np.array(vectors))
    return embedded

In [6]:
# custom Dataset class
class MovieReviewsData(Dataset):
    """Dataset of embedded phrases, zero-padded to a common length.

    Each element of ``X`` is converted to a float tensor and padded with zero
    rows at the end (along its first dimension) up to ``maxlen``; the last
    dimension is left untouched.

    Args:
        X: sequence of 2-D array-likes, one per phrase.
        Y: optional sequence of integer labels, one per element of ``X``.
        maxlen: optional target length. When omitted, the length of the
            longest element of ``X`` is used (0 for an empty ``X``). Passing
            the training set's ``maxlen`` when building the test set gives
            both the same padded length; elements longer than ``maxlen`` are
            truncated.

    Raises:
        ValueError: if ``Y`` is given and its length differs from ``X``.
    """
    def __init__(self, X, Y = None, maxlen = None):
        if Y is not None and len(Y) != len(X):
            raise ValueError(
                "X and Y must have the same length, got %d and %d" % (len(X), len(Y))
            )
        if maxlen is None:
            # default=0 keeps an empty dataset from raising inside max().
            maxlen = max((len(x) for x in X), default=0)
        self.maxlen = maxlen
        self.X = [self._fit_to_maxlen(torch.FloatTensor(x)) for x in X]
        if Y is not None:
            self.Y = torch.LongTensor(Y)
        else:
            self.Y = None

    def _fit_to_maxlen(self, seq):
        # Truncate, then zero-pad along the first dimension to exactly self.maxlen.
        seq = seq[:self.maxlen]
        # Pad spec is (last-dim left, last-dim right, first-dim top, first-dim bottom).
        return pad(seq, (0, 0, 0, self.maxlen - len(seq)))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Labelled datasets yield (features, label); unlabelled ones yield features only.
        if self.Y is not None:
            return self.X[idx], self.Y[idx]
        else:
            return self.X[idx]

In [7]:
# load data
# Parse the GloVe text file into a token -> vector dict (the recorded output reports 1917495 entries).
word2vec_dict = loadGloveModel('glove.42B.300d/glove.42B.300d.txt')
# get_movie_reviews_data reads the module-level word2vec_dict, so it must be assigned before these calls.
train_tokenized_sentences, train_Y = get_movie_reviews_data("sentiment-analysis-on-movie-reviews/train.tsv", "train")
# The "test" mode returns None in place of labels, hence the discarded second value.
test_tokenized_sentences, _ = get_movie_reviews_data("sentiment-analysis-on-movie-reviews/test.tsv", "test")
# Replace each token with its vector: one array per phrase, with one row per token.
train_X = get_embeddings(train_tokenized_sentences, word2vec_dict)
test_X = get_embeddings(test_tokenized_sentences, word2vec_dict)

Loading Glove Model
1917495  words loaded!


In [8]:
# Cache the embedded data on disk.
# train_X / test_X are lists holding one array per phrase, and phrases have
# different token counts. Handing such a ragged list straight to np.save makes
# NumPy try to build a regular array, which raises ValueError on NumPy >= 1.24.
# Wrapping the list in an explicit 1-D object array avoids that; the files are
# pickled and must be read back with np.load(..., allow_pickle=True).
def _ragged_to_object_array(arrays):
    """Return a 1-D object array whose i-th element is ``arrays[i]``."""
    out = np.empty(len(arrays), dtype=object)
    # Element-wise assignment stops NumPy from broadcasting or merging the items.
    for i, item in enumerate(arrays):
        out[i] = item
    return out

np.save('train_X.npy', _ragged_to_object_array(train_X), allow_pickle=True)
np.save('train_Y.npy', train_Y)
np.save('test_X.npy', _ragged_to_object_array(test_X), allow_pickle=True)

In [9]:
# Wrap the embedded phrases in Dataset objects. By default each dataset pads to
# the length of its own longest phrase.
# NOTE(review): train and test tensors may therefore have different sequence
# lengths -- confirm the downstream model accepts variable lengths.
train_dataset = MovieReviewsData(train_X, train_Y)
test_dataset = MovieReviewsData(test_X)
# Training batches are reshuffled; test batches keep the dataset order.
train_loader = DataLoader(train_dataset, shuffle = True, batch_size = BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle = False, batch_size = BATCH_SIZE)