In [27]:
import torch
import feather
import joblib
import pandas as pd
import numpy as np
import os
import sys
import operator
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [121]:
GLOVE_DIR = '../../data/embeddings'
MAX_SEQUENCE_LENGTH = 256
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# Load preprocessed data

In [122]:
df_rev_balanced = feather.read_dataframe("../../data/balanced_reviews.feather")

In [4]:
tokenizer = joblib.load('../../assets/tokenizer.pickle')

In [5]:
with pd.HDFStore('../../assets/yelp_x_y_test_train.h5') as h:
    X_train = h['X_train'].values
    X_test = h['X_test'].values
    y_train = h['y_train'].values
    y_test = h['y_test'].values
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=operator.itemgetter(1))

# Load pretrained embeddings

In [6]:
print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, './glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [7]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(WORD_INDEX_SORTED))
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [120]:
X_train.shape

(96308, 256)

In [8]:
embedding_matrix = torch.tensor(embedding_matrix)

In [9]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [113]:
class Model(nn.Module):
    def __init__(self, weights_matrix, hidden_size, output_size):
        super(Model, self).__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=False)
        self.linear = nn.Linear(hidden_size, output_size)
#         self.sigmoid = nn.Sigmoid()

    def forward(self, inp):
        h_embedding = self.embedding(inp)
        h_lstm, _ = self.lstm(h_embedding)
#         out = self.sigmoid(self.linear(h_lstm))
        out = self.linear(h_lstm)
        return out

In [114]:
#instantiate the model 
model = Model(weights_matrix=embedding_matrix,
                   hidden_size=64,
                   output_size=1).cuda()
#defining loss_function and optimizer
# loss_function = nn.BCELoss(reduction='mean')
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [115]:
from torch.nn.utils.rnn import pad_packed_sequence

In [116]:
train = TensorDataset(torch.tensor(X_train).to(torch.int64).cuda(), torch.tensor(y_train).cuda())

In [117]:
trainloader = DataLoader(train, batch_size=128)

In [119]:
for epoch in range(1, 21):
    toy_lstm.train()
    for data, target in trainloader:
        optimizer.zero_grad()
        output = model(data)
        print(data.shape)
        print(target.shape)
        print(output.shape)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

torch.Size([128, 256])
torch.Size([128, 1])
torch.Size([128, 256, 1])


ValueError: Target size (torch.Size([128, 1])) must be the same as input size (torch.Size([128, 256, 1]))