In [1]:
import pandas as pd
import numpy as np
import string
import collections
import nltk
import nltk.corpus as corpus
import nltk.stem as stem
import torch
import torch.utils as utils
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


### Loading the data

In [2]:
# loading inputs & labels into a dataframe
df = pd.read_csv("data/imbd_dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# understanding some important features of the data
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
print(df.isnull().values.any()) # checking for null values
df.drop_duplicates(inplace=True) # removing duplicate reviews

False


In [5]:
# making sure those reviews were removed
print(df["review"].shape )
print(df["sentiment"].shape)

(49582,)
(49582,)


### Preprocessing

In [6]:
words = set(corpus.words.words())
punc = string.punctuation
wordnet = corpus.wordnet
lemmatizer = stem.WordNetLemmatizer().lemmatize

# gets part of speech (not using)
def pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    mappings = dict(J=wordnet.ADJ, N=wordnet.NOUN, V=wordnet.VERB, R=wordnet.ADV)
    return mappings.get(tag, wordnet.ADJ)

def remove_punc(word):
    i = word.find('br')
    if i != -1:
        if word[i:i+3] == 'br':           
            word = word.replace('br', '')   
    word = ''.join([c for c in word if c not in punc])
    return word

# creating a function that removes irrelevant characters, punctuation, stopwords, and break tags
def augment(text, stopwords=True):
    data = []
    for review in text:
        if stopwords:
            stopwords = set(corpus.stopwords.words('english')) # words irrelevant to the sentiment analysis
        else:
            stopwords = set()            
        review = review.lower() # lowercase
        review = review.split(' ') # split into tokens by space
        review = [remove_punc(word) for word in review] # remove punctuation
        review = [w for w in review if w not in stopwords and w] # remove stopwords and non-words
        data.append(' '.join(review)) # transform back to sequence


    return np.array(data)


# create copy array of 20 reviews
test = np.array(df['review'][:20].copy())
print(test[1])
print()
print(augment(test)[1])

reviews = augment(df['review'])
print(reviews.shape)

A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done.



### Creating Embeddings to Encode to Reviews

In [7]:
allwords = ' '.join([r for r in reviews]).split()
n = len(set(allwords))
allwords = collections.Counter(allwords) # count all words
mostcommon = allwords.most_common(n) # most common to least common (descending)
embeddings = {w: i + 1 for i, (w, c) in enumerate(mostcommon)} # create embeddings (start at 1 to use 0 as padding)
label_encoding = {'positive': 1, 'negative': 0}

# encodes the reviews based on the embeddings
def encode(x, embeddings):
    encodings = []
    for review in x:
        # creates encoding of review based on embedding mappings (if word not in mapping default to 0)
        encodings.append([embeddings.get(w, 0) for w in review.split()])

    return np.array(encodings)


x = encode(reviews, embeddings)
y = encode(df['sentiment'], label_encoding).squeeze()
print(x.shape, y.shape)
print(len(max(x, key=len)))
print(y[1])

(49582,) (49582,)
1437
1


  return np.array(encodings)


### Defining Sequences and adding padding

In [8]:
# adds pad or removes embeddings if the review is too short or too long

maxlen = 150

# takes encoded vectors and pads/cuts them to the max length
def padvec(x, maxlen=None):
    sequences = []
    for review in x:
        n = len(review)
        if n < maxlen:
            review = np.pad(review, (maxlen - n, 0), 'constant', constant_values=0)
        elif n > maxlen:
            review = review[:maxlen]
            # print(len(review))
        sequences.append(review)
    return np.array(sequences)

seq = padvec(x, maxlen)

print(len(min(seq, key=len)) == len(max(seq, key=len))) # validating the function works

True


### Creating Training, Test, and Validation Sets

In [9]:
# splitting data into 80% training 20% testing 
split = 0.8
n = len(seq)
x_train, y_train = seq[:int(n * split)], y[:int(split * n)]
x_test, y_test = seq[int(n * split):], y[int(split * n):]

# making sure dims align
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(39665, 150) (39665,)
(9917, 150) (9917,)


In [10]:
training = utils.data.TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
testing = utils.data.TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

### Dataloaders

In [11]:
trainloader = utils.data.DataLoader(training, shuffle=True, batch_size=64, drop_last=True)
testloader = utils.data.DataLoader(testing, shuffle=True, batch_size=64, drop_last=True)

# getting a basic sample
trainiter = iter(trainloader)
data, labels = trainiter.next()
print(data[0])
print(labels[0])

tensor([    31,    269,   1735,      2,  12885,     23,    282,  33921,     61,
           481,    263,     54,    410,    178,    471,    612,   6584,     22,
             3,   3332,    337,   4952,  32525,  11068,  33920,    383,  69105,
         12885,    380,  69105, 118307,    424,     70,    230,     89,     49,
          6004,  14032,    721,   5166,   1169,    289,  15506,  13245,   3748,
         69106,     32,    661,   7002,     76,    721,   4037,    120,   2178,
           555,   2129,   9468,   1868,    553,   1544,   2178,     11,    155,
            89,     49,  21071,   1805,    418,    102,    632,    208,     52,
          4104,    441,   3452,   4858,  12391,    168,    842,     75,    318,
           693,   5006,  12885,    743,   1845,   2603,   1675,   4885,   3255,
          8903,   2153,   2569,   4468,     14,     38,   1868,    383,     49,
            60,   1574,    665,    134,    273,   1624,      3,   2823,    720,
             2,    390,    893,   1384, 

### Building LSTM 

In [12]:

class LSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, layers, dropout=0.5):
        super().__init__()
        self.output_size = output_size
        self.layers = layers
        self.hidden_dim = hidden_dim

        # embedding and lstm layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, 
                            num_layers=layers, dropout=dropout, batch_first=True)
        self.drop = nn.Dropout(0.3)
        # linear layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x, hidden):
        batch_size = x.size(0)

        # outputing values from lstm based on embeddings
        x = self.embedding(x)
        x, hidden = self.lstm(x, hidden)

        # stack outputs from lstm layers, drop outputs, compute sigmoid
        x = x.contiguous().view(-1, self.hidden_dim)
        x = self.drop(x)
        x = self.fc(x)
        x = self.sigmoid(x)

        # reshape to have batch size size first
        x = x.view(batch_size, -1)
        x = x[:, -1]

        return x, hidden

    def init_hidden(self, batch_size):
        # create new tensors initialized to zero for hidden state & lstm cell state
        w = next(self.parameters()).data
        hidden = (w.new(self.layers, batch_size, self.hidden_dim).zero_(), w.new(self.layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

### Trianing & Testing

In [13]:
vocab_size = len(embeddings) + 1 
output_size = 1 # positive or negative sentiment
embedding_dim = 400 # vector size of embeddings
hidden_dim = 256 # neurons
layers = 2 # number of lstm layers

In [14]:


# trains the lstm network
def train(net, trainloader, optimizer, loss, clip=5, verbose=0.25, batch_size=64, epochs=3):
    net.train()
    n = len(trainloader.dataset)

    for epoch in range(epochs):
        batches = 0
        h = net.init_hidden(batch_size) # init hidden state

        for i, data in enumerate(trainloader, 0):
            # grab input and batch infp
            inputs, labels = data
            batch_size = len(inputs)
            
            h = tuple([tensor.data for tensor in h]) # create new tensors for each hidden state

            net.zero_grad() # compute new gradients

            # transform data and get prediction
            inputs = inputs.type(torch.LongTensor)
            outputs, h = net(inputs, h)

            # find loss, and update weights according to computed gradient
            loss = loss_fn(outputs.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip) # prevent exploding gradient
            optimizer.step() # make step against gradient (slope)

            batches += batch_size
            if (i + 1) % (int(len(trainloader) * verbose)) == 0:
                print(f"epoch: {epoch + 1}/{epochs}\nsamples trained: {batches}/{n}\nloss: {loss.item()}")
        print(f'epoch complete {n}/{n} trained')
    print(f'training complete')

In [15]:
# test peformance of network
def test(net, testloader, loss_fn, batch_size=64):
    net.eval() # indicate to layers model is being tested
    n = len(testloader.dataset)
    loss = 0
    correct = 0
    h = net.init_hidden(batch_size) # init first hidden state
    
    # find loss & num correct from predictions
    for inputs, labels in testloader:

        h = tuple([tensor.data for tensor in h])

        inputs = inputs.type(torch.LongTensor)
        outputs, h = net(inputs, h)
        loss += loss_fn(outputs.squeeze(), labels.float())
        pred = torch.round(outputs.squeeze()) # round to nearest int
        correct += pred.eq(labels.float().view_as(pred)).sum().item()  # total correct in batch

    loss /= n # avg the loss
    print(f"avg loss: {loss} acc: {correct / n}")

In [16]:

net = LSTM(vocab_size, output_size, embedding_dim, hidden_dim, layers) # init lstm network
loss_fn = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
print(net) # viewing network architecture

LSTM(
  (embedding): Embedding(167965, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (drop): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [17]:
train(net, trainloader, optimizer, loss_fn, verbose=0.25, batch_size=64, epochs=2)

epoch: 1/2
samples trained: 9856/39665
loss: 0.4562649726867676
epoch: 1/2
samples trained: 19712/39665
loss: 0.3150315284729004
epoch: 1/2
samples trained: 29568/39665
loss: 0.36512449383735657
epoch: 1/2
samples trained: 39424/39665
loss: 0.29565709829330444
epoch complete 39665/39665 trained
epoch: 2/2
samples trained: 9856/39665
loss: 0.29730331897735596
epoch: 2/2
samples trained: 19712/39665
loss: 0.12457650154829025
epoch: 2/2
samples trained: 29568/39665
loss: 0.3814222812652588
epoch: 2/2
samples trained: 39424/39665
loss: 0.25582391023635864
epoch complete 39665/39665 trained
training complete


In [18]:
test(net, testloader, loss_fn, batch_size=64)

avg loss: 0.004954873584210873 acc: 0.8594332963597863


### Saving Model

In [19]:
torch.save(net.state_dict(), 'models/imbd.pth')

In [20]:
def predict(net, maxlen=150):
    reviews = []
    # get review(s)
    while True:
        review = input("input a review:")
        if review:
            reviews.append(review)
        elif reviews:
            break

    # transform review to proper tensor
    inputs = augment(reviews)
    inputs = encode(inputs, embeddings)
    inputs = padvec(inputs, maxlen=maxlen)
    inputs = torch.from_numpy(inputs)
    inputs = inputs.type(torch.LongTensor)
    batch_size = inputs.size(0)

    net.eval() # indicate to layers not to train

    # make prediction
    h = net.init_hidden(batch_size)
    outputs, h = net(inputs, h)
    pred = torch.round(outputs.squeeze())
    pred = pred.tolist()

    # sigular predictions
    if type(pred) is float:
        pred = [pred]
    
    # display prediction
    for i, label in enumerate(pred):
        print(f"{reviews[i]} is a {'positive' if label else 'negative'} review")

In [21]:
predict(net)

I found this movie to be really good. I quite enjoyed spiderman far from home. is a positive review
I think Morbius was horrible. it was a bad movie and I honestly would never recommend anyone see the movie. is a negative review
I cant wait for Thor Love and Thunder. I think it will be a good movie and honestly, I suspect it will be the second bezt of all the Thor movies. is a positive review


  return np.array(encodings)
