In [None]:
import nltk
import numpy as np
import torch
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import pad
# Fetch the NLTK resources the preprocessing cell relies on: the English
# stop-word list (stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Mini-batch size constant; not referenced by any other cell visible here.
BATCH_SIZE = 64

In [2]:
# read GloVe
# citation: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadGloveModel(File):
    """Load GloVe vectors from a whitespace-separated text file.

    Every non-blank line is expected to contain a token followed by the
    components of its vector, all separated by whitespace.

    Args:
        File: Path to the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D numpy array with its
        embedding. If a token appears on several lines, the last one wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    gloveModel = {}
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            splitLines = line.split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no token; skip it instead of failing on splitLines[0].
            if not splitLines:
                continue
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [3]:
# read movie reviews data
# tokenize -> lowercase -> remove stopwords -> lemmatize
def get_movie_reviews_data(path, data_type = "train", vocab = None):
    """Read a tab-separated reviews file and return preprocessed sentences.

    The first line of the file is skipped as a header. For every remaining
    non-blank row the text in column index 2 is tokenized on ``\\w+``,
    lower-cased, filtered against the vocabulary and the NLTK English stop
    words, and lemmatized. Sentences that end up with no tokens are dropped;
    in "train" mode the matching label (column index 3) is dropped with them
    so sentences and labels stay aligned.

    Args:
        path: Path to the ``.tsv`` file (read as UTF-8).
        data_type: "train" to also parse labels, "test" to parse text only.
        vocab: Container of known words used for the vocabulary filter.
            Defaults to the notebook-level ``word2vec_dict`` so existing
            two-argument calls keep working.

    Returns:
        ``(tokenized_sentences, labels)`` where ``labels`` is a list of ints
        in "train" mode and ``None`` in "test" mode.

    Raises:
        ValueError: If ``data_type`` is neither "train" nor "test".
    """
    if data_type not in ("train", "test"):
        raise ValueError('data_type must be "train" or "test", got %r' % (data_type,))
    if vocab is None:
        # Fall back to the GloVe dictionary loaded at notebook level.
        vocab = word2vec_dict

    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests; the raw NLTK value is a list.
    english_stopwords = set(stopwords.words('english'))

    def _preprocess(sentence):
        """Tokenize, lower-case, filter and lemmatize a single sentence."""
        kept = []
        for token in tokenizer.tokenize(sentence):
            lowered = token.lower()
            if lowered not in vocab or lowered in english_stopwords:
                continue
            lemma = lemmatizer.lemmatize(lowered)
            # The lemma itself may be out of vocabulary; keeping it would give
            # a sentence with no embeddable words further down the pipeline.
            if lemma in vocab:
                kept.append(lemma)
        return kept

    with open(path, encoding='utf-8') as f:
        # Skip the header row and ignore blank lines (e.g. a trailing newline).
        rows = [line.split('\t') for line in f.readlines()[1:] if line.strip()]

    tokenized_sentences = [_preprocess(row[2]) for row in rows]

    if data_type == "test":
        # NOTE(review): dropping empty sentences means the output no longer
        # lines up one-to-one with the rows of the test file.
        return [tokens for tokens in tokenized_sentences if tokens], None

    labels = [int(row[3]) for row in rows]
    kept_pairs = [(tokens, label) for tokens, label in zip(tokenized_sentences, labels) if tokens]
    tokenized_sentences = [tokens for tokens, _ in kept_pairs]
    labels = [label for _, label in kept_pairs]
    return tokenized_sentences, labels

In [4]:
# get embeddings
def get_embeddings(tokenized_sentences, word2vec_dict):
    """Convert tokenized sentences into arrays of word vectors.

    Args:
        tokenized_sentences: Iterable of token sequences.
        word2vec_dict: Mapping from word to its embedding vector.

    Returns:
        A list with one numpy array per sentence, built from the vectors of
        the tokens found in ``word2vec_dict`` (unknown tokens are skipped).
    """
    embedded_sentences = []
    for tokens in tokenized_sentences:
        vectors = []
        for token in tokens:
            if token in word2vec_dict:
                vectors.append(word2vec_dict[token])
        embedded_sentences.append(np.array(vectors))
    return embedded_sentences

In [None]:
# custom Dataset class
class MovieReviewsData(Dataset):
    """Dataset of embedded sentences, zero-padded to a common length.

    Args:
        X: Sequence of per-sentence embedding arrays. The padding spec below
            pads the second-to-last dimension, so each item is assumed to be
            2-D (tokens x embedding size) -- TODO confirm against callers.
        Y: Optional sequence of integer labels aligned with ``X``. When
            omitted, items are returned without a label.

    Attributes set in ``__init__``:
        maxlen: Length of the longest item in ``X`` (0 when ``X`` is empty).
        X: List of float tensors, each padded with zeros up to ``maxlen``.
        Y: ``torch.LongTensor`` of labels, or ``None``.
    """
    def __init__(self, X, Y = None):
        # default=0 lets an empty X build an empty dataset instead of making
        # max() raise ValueError on an empty sequence.
        self.maxlen = max((len(x) for x in X), default=0)
        # Pad spec (0, 0, 0, n): leave the last dimension untouched and append
        # n zero rows at the end of the second-to-last dimension.
        self.X = [pad(torch.FloatTensor(x), (0, 0, 0, self.maxlen - len(x))) for x in X]
        if Y is not None:
            self.Y = torch.LongTensor(Y)
        else:
            self.Y = None
    
    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.X)
    
    def __getitem__(self, idx):
        """Return ``(x, y)`` when labels are present, otherwise just ``x``."""
        if self.Y is not None:
            return self.X[idx], self.Y[idx]
        else:
            return self.X[idx]

In [None]:
# Download the GloVe 6B archive (glove.6B.zip) into the current working directory.
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Download the "sentiment-analysis-on-movie-reviews" competition files with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured beforehand -- confirm.
!kaggle competitions download -c sentiment-analysis-on-movie-reviews

In [None]:
# load data
!unzip *.zip
# !unzip /content/competitions/sentiment-analysis-on-movie-reviews/\*.zip
word2vec_dict = loadGloveModel('./glove.6B.100d.txt')

train_tokenized_sentences, train_Y = get_movie_reviews_data("./train.tsv", "train")
test_tokenized_sentences, _ = get_movie_reviews_data("./test.tsv", "test")
train_X = get_embeddings(train_tokenized_sentences, word2vec_dict)
test_X = get_embeddings(test_tokenized_sentences, word2vec_dict)

In [14]:
#save word2vec_dict
# Split the GloVe dictionary into `num` pickled shards of `per_num` entries
# each; the last shard also receives the remainder of the integer division.
import pickle
num = 5
per_num = len(word2vec_dict)//num
word2vec_list = list(word2vec_dict.items())
for i in range(num):
  shard_start = i*per_num
  # None as the slice end makes the final shard run to the end of the list.
  shard_end = (i+1)*per_num if i != num-1 else None
  sub_word2vec = dict(word2vec_list[shard_start:shard_end])
  # `with` closes (and therefore flushes) each shard file deterministically;
  # passing a bare open(...) to pickle.dump leaves the handle open.
  with open("./drive/My Drive/good_or_bad/word2vec/word2vec_100d/word2vec_100d_" + str(i+1) + ".pt",'wb') as shard_file:
    pickle.dump(sub_word2vec, shard_file)

In [None]:
#save training data
def _ragged_to_object_array(arrays):
    """Pack a list of differently sized arrays into a 1-D object array."""
    packed = np.empty(len(arrays), dtype=object)
    # Element-wise assignment keeps every item as its own array and avoids
    # NumPy trying to broadcast the items into one rectangular block.
    for position, item in enumerate(arrays):
        packed[position] = item
    return packed

# train_X / test_X hold one array per sentence with varying lengths. np.save
# first converts its argument to an array, and recent NumPy releases raise
# ValueError for such ragged input, so build the object arrays explicitly.
# Files written this way must be read back with np.load(..., allow_pickle=True).
# Google Drive has to be mounted at /content/drive before this cell runs.
np.save('/content/drive/MyDrive/good_or_bad/data/train_X_100d_dict.npy', _ragged_to_object_array(train_X))
np.save('/content/drive/MyDrive/good_or_bad/data/train_Y_100d_dict.npy', train_Y)
np.save('/content/drive/MyDrive/good_or_bad/data/test_X_100d_dict.npy', _ragged_to_object_array(test_X))

In [12]:
# Mount Google Drive at /content/drive (Colab only); the cells that read or
# write under /content/drive/MyDrive depend on this mount being in place.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [17]:
# Load the previously saved sample embeddings and list the stored words.
# `with` closes the file handle once the pickle has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("/content/drive/MyDrive/good_or_bad/sample_words_embeddings.pt",'rb') as sample_file:
    d = pickle.load(sample_file)
print(d.keys())

dict_keys(['good', 'best', 'great', 'surprise', 'interesting', 'bad', 'worst', 'boring', 'terrible', 'horrible'])


In [24]:
#sample embeddings
# Words whose GloVe vectors are exported as a small sample. The dict below
# keeps this order; a word missing from word2vec_dict raises KeyError.
sample_words = ['easy', 'like', 'device', 'great', 'love', 'price',
                'old', 'game', 'well', 'better', 'horrible', 'bad']
movie_review_words = {word: word2vec_dict[word] for word in sample_words}
# `with` guarantees the pickle file is flushed and closed after writing.
with open("/content/drive/MyDrive/good_or_bad/sample_word_embedding/100d/amazon_products_sample_embeddings.pt",'wb') as sample_file:
    pickle.dump(movie_review_words, sample_file)