In [1]:
import numpy as np
import pandas as pd

from nltk.data import find
import gensim

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [2]:
with open("imdb.pos") as f:
    pos = f.read().splitlines()

with open("imdb.neg") as f:
    neg = f.read().splitlines()

df_pos = pd.DataFrame(pos, columns=["review"])
df_pos["sentiment"] = 1
df_neg = pd.DataFrame(neg, columns=["review"])
df_neg["sentiment"] = 0
df = pd.concat([df_pos, df_neg], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,review,sentiment
0,Very disappointed,0
1,A Nutshell Review : Jack Reacher,1
2,something I want to say about the movie,1
3,An Awesome Action Film !!!,1
4,FOUR STAR - Sophisticated romantic comedy lock...,1
...,...,...
599995,Road to your dreams,1
599996,Love Rutger Hauer but WHY,0
599997,"Amazing film with a unique vision , and very f...",1
599998,"Genuinely awful , surprised I watched it to th...",0


In [3]:
max_sentence_length = df["review"].apply(lambda x: len(x.split(" "))).max()
max_sentence_length

62

In [4]:
df["seq_len"] = df["review"].apply(lambda x: len(x.split(" ")))
df

Unnamed: 0,review,sentiment,seq_len
0,Very disappointed,0,2
1,A Nutshell Review : Jack Reacher,1,6
2,something I want to say about the movie,1,8
3,An Awesome Action Film !!!,1,5
4,FOUR STAR - Sophisticated romantic comedy lock...,1,20
...,...,...,...
599995,Road to your dreams,1,4
599996,Love Rutger Hauer but WHY,0,5
599997,"Amazing film with a unique vision , and very f...",1,10
599998,"Genuinely awful , surprised I watched it to th...",0,10


In [5]:
def embed_sentence(sentence, pad_to):
    words = sentence.split(" ")
    X = np.zeros((pad_to, 300))
    for i in range(min(pad_to, len(words))):
        if words[i] in word2vec:
            X[i] = word2vec[words[i]]
    return X

In [6]:
def embed_batch(txt_batch, pad_to):
    X_batch = np.zeros((len(txt_batch), pad_to, 300))
    for i in range(len(txt_batch)):
        X_batch[i] = embed_sentence(txt_batch[i], pad_to)
    return X_batch

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df[["review", "seq_len"]], df["sentiment"], test_size=0.2, random_state=42
)

In [11]:
class LSTMforSentimentAnalysis(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, batch_size, device):
        super(LSTMforSentimentAnalysis, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.batch_size = batch_size
        self.device = device
        self.to(device)

    def forward(self, X): # X is a batch of not embedded sentences 
        # we embed the batch and pass it to the LSTM
        sentences = X["review"].values
        seq_len = X["seq_len"].values
        batch_size = len(X)
        X = embed_batch(sentences, max_sentence_length)
        X = torch.from_numpy(X).float().to(self.device)
        hidden = torch.zeros(1, batch_size, self.hidden_dim).to(self.device)
        cell = torch.zeros(1, batch_size, self.hidden_dim).to(self.device)
        out, (hidden, cell) = self.lstm(X, (hidden, cell)) # out is of shape (batch_size, seq_len, hidden_dim)

        # now in out we have the hidden state for each word in the sentence
        # we need to construct the batch of hidden states to pass to the fc layer
        # we take the last hidden state for each sentence BEFORE the padding starts
        # thanks to the seq_len column we added to the dataframe

        hidden_batch = out[torch.arange(batch_size), seq_len-1, :].to(self.device) # of shape (batch_size, hidden_dim)
        # we pass the hidden state of the last word of each sentence to the fc layer
        out = self.fc(hidden_batch)
        return out
        

In [13]:
n_epochs = 10
batch_size = 256

model = LSTMforSentimentAnalysis(300, 100, 1, batch_size, "cuda")

# training
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


for epoch in range(n_epochs):
    avg_loss = 0
    model.train()
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        y_batch = torch.from_numpy(y_batch.values).to("cuda")
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output.squeeze(), y_batch.float())
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
        if i % 500 == 0:
            print("Batch: %d/%d, loss: %.5f" % (i//batch_size, len(X_train)//batch_size, loss.item()))
    avg_loss /= batch_size

    # evaluation
    print("Evaluating...")
    model.eval()
    y_pred = []

    for i in range(0, len(X_test), batch_size):
        batch = X_test[i:i+batch_size]
        output = model(batch)
        y_pred.extend(output.squeeze().detach().cpu().numpy().tolist())

    y_pred = np.array(y_pred)
    y_pred = F.sigmoid(torch.from_numpy(y_pred)).numpy()
    y_pred = np.round(y_pred)
    accuracy = np.mean(y_pred == y_test.values)
    print("Epoch: %d/%d, avg loss: %.5f, test accuracy: %.5f" % (epoch + 1, n_epochs, avg_loss, accuracy))

Batch: 0/1875, loss: 0.69400
Batch: 125/1875, loss: 0.50888
Batch: 250/1875, loss: 0.44191
Batch: 375/1875, loss: 0.50317
Batch: 500/1875, loss: 0.46554
Batch: 625/1875, loss: 0.48057
Batch: 750/1875, loss: 0.46758
Batch: 875/1875, loss: 0.43477
Batch: 1000/1875, loss: 0.49115
Batch: 1125/1875, loss: 0.44191
Batch: 1250/1875, loss: 0.48562
Batch: 1375/1875, loss: 0.46292
Batch: 1500/1875, loss: 0.47939
Batch: 1625/1875, loss: 0.46111
Batch: 1750/1875, loss: 0.40438
Evaluating...
Epoch: 1/10, avg loss: 3.48113, test accuracy: 0.77406
Batch: 0/1875, loss: 0.42574
Batch: 125/1875, loss: 0.39063
Batch: 250/1875, loss: 0.42462
Batch: 375/1875, loss: 0.47628
Batch: 500/1875, loss: 0.42045
Batch: 625/1875, loss: 0.45970
Batch: 750/1875, loss: 0.42022
Batch: 875/1875, loss: 0.40574
Batch: 1000/1875, loss: 0.45170
Batch: 1125/1875, loss: 0.40954
Batch: 1250/1875, loss: 0.45902
Batch: 1375/1875, loss: 0.43964
Batch: 1500/1875, loss: 0.45340
Batch: 1625/1875, loss: 0.43294
Batch: 1750/1875, loss:

KeyboardInterrupt: 

In [15]:
torch.save(model.state_dict(), "./lstm-sentiment.model")

In [16]:
def manual_test(sentence):
    with torch.no_grad():
        model.eval()
        length = len(sentence.split(" "))
        df = pd.DataFrame(data={"review": [sentence], "seq_len":[length]})
        return torch.sigmoid(model(df).to("cpu")).item()

In [None]:
manual_test("Whoaaaa, that movie was sooooo greaaaat")