In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Get to the folder we are at
FOLDERNAME = 'Colab\ Notebooks/stanCodeNLP'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/stanCodeNLP


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [4]:
# Seed for same output
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [19]:
# Reading in our file
raw_data = pd.read_csv('IMDBDataset.csv')

In [20]:
# Get data & labels
reviews = raw_data.review
print(reviews.head())
labels = raw_data.sentiment
print(labels)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object
0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


In [21]:
# Replace 'positive' with 1; 'negative' with 0
labels.replace({'positive':1,'negative':0}, inplace =True)

In [22]:
labels

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [23]:
patterns = ['<br />', '--', '.', ',', '!', '?', ')', '(', ';', ':', '*', '~', '_', "'", '"']
replacements = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '']

In [24]:
def preprocessing(reviews, patterns, replacements):
    lst = []
    for i in range(len(reviews)):
        review = reviews[i].lower()
        for pattern, replacement in zip(patterns, replacements):
            review = review.replace(pattern, replacement)
        lst.append(review)
    return lst

In [25]:
reviews = preprocessing(reviews, patterns, replacements)

In [26]:
num_train = 35000
num_val = 15000
longest_num_tokens = 250

In [39]:
def indexing_tokens():
    indices = {'<SOS>':0, '<EOS>':1, '<PAD>':2, '<UNK>':3}
    counter = 4
    for i in range (num_train):
        tokens = reviews[i].split()
        for token in tokens:
            if token not in indices:
                indices[token] = counter
                counter+=1
    return indices

In [44]:
def get_data(indices, longest_line_tokens, mode='train'):
    data = []
    Y = []
    if mode == 'train':
        for i in range (num_train):
            one_train_data = []
            y, tokens = labels[i], reviews[i].split()
            for token in tokens:
                one_train_data.append(indices[token])
                if len(one_train_data) == longest_line_tokens:
                    break
            while len(one_train_data) < longest_line_tokens:
                one_train_data.append(indices['<PAD>'])
            one_train_data.insert(indices['<SOS>'],0)
            one_train_data.append(indices['<EOS>'])
            data.append(one_train_data)
            Y.append(y)
    else:
        for i in range (num_train, num_train + num_val):
            one_val_data = []
            y, tokens = labels[i], reviews[i].split()
            for token in tokens:
                if token not in indices:
                    one_val_data.append(indices['<UNK>'])
                else:
                    one_val_data.append(indices[token])
                if len(one_val_data) == longest_line_tokens:
                    break
            while len(one_val_data) < longest_line_tokens:
                one_val_data.append(indices['<PAD>'])
            one_val_data.insert(indices['<SOS>'], 0)
            one_val_data.append(indices['<EOS>'])
            data.append(one_val_data)
            Y.append(y)
    return data, Y

In [45]:
# Loading Training Data & Val Data
indices = indexing_tokens()
training_data, training_labels = get_data(indices, longest_num_tokens)
val_data, val_labels = get_data(indices, longest_num_tokens, mode='val')

In [46]:
print('Number of training:', len(training_data))
print('Number of validation:', len(val_data))
print('Length of corpus:', len(indices))

Number of training: 35000
Number of validation: 15000
Length of corpus: 122545


In [47]:
# Create tensors of train & val
train_tensor = torch.tensor(training_data)
train_labels_tensor = torch.tensor(training_labels)
val_tensor = torch.tensor(val_data)
val_labels_tensor = torch.tensor(training_labels)

In [48]:
print('Train Tensor:', train_tensor.shape)
print('Val Tensor:', val_tensor.shape)

Train Tensor: torch.Size([35000, 252])
Val Tensor: torch.Size([15000, 252])


In [49]:
vocab_size = 122545
embedding_dim = 300
hidden_dim = 256
sequence_len = 252
output_dim = 2
print_every = 400
batch_size = 32

In [50]:
class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        # N x 252
        embedded_data = self.embedding_layer(x) # x = [1, 122544, 1234, .....,0]
        # N x 300 x 252
        output, (h_n, c_n)= self.lstm(embedded_data)
        out = output[:,-1,:] # h_n.squeeze()
        out = nn.functional.dropout(out)
        out = self.fc(out)
        return out

In [51]:
model = MyModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model = model.cuda()

In [52]:
mini_trains = DataLoader(train_tensor, batch_size=batch_size)
mini_train_labels = DataLoader(training_labels, batch_size=batch_size)

mini_vals = DataLoader(val_tensor, batch_size=batch_size)
mini_val_labels = DataLoader(val_labels, batch_size=batch_size)

In [53]:
iterator = iter(mini_trains)
print(next(iterator).shape)

iterator = iter(mini_train_labels)
print(next(iterator).shape)

torch.Size([32, 252])
torch.Size([32])


In [54]:
# Training Procedure
def train(num_epoch, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer):
  for epoch in range(num_epoch):
    num_iters = 0
    for x, y in zip(mini_trains, mini_train_labels):
      model.train()
      x = x.to(device)
      y = y.to(device)
      scores = model(x)
      loss = loss_function(scores, y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if num_iters % print_every == 0:
        evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device)
      num_iters += 1

In [55]:
# Evaluate Procedure
def evaluate_predictor(model, epoch, mini_vals, mini_val_labels, device):
  model.eval()
  with torch.no_grad():
    acc_count = 0
    for x, y in zip(mini_vals, mini_val_labels):
      x=x.to(device)
      y=y.to(device)
      scores=model(x)
      predictions=scores.max(1)[1]
      acc = predictions.eq(y).sum().item()
      acc_count += acc
    print(f'Epoch[{epoch+1}] Acc: {acc_count/len(val_data)}')

In [56]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [57]:
# Start training
train(5, model, mini_trains, mini_train_labels, mini_vals, mini_val_labels, device, loss_function, optimizer)

Epoch[1] Acc: 0.5047333333333334
Epoch[1] Acc: 0.5046666666666667
Epoch[1] Acc: 0.5007333333333334
Epoch[2] Acc: 0.5064
Epoch[2] Acc: 0.5611333333333334
Epoch[2] Acc: 0.5014
Epoch[3] Acc: 0.5089333333333333
Epoch[3] Acc: 0.6336
Epoch[3] Acc: 0.5226666666666666
Epoch[4] Acc: 0.6211333333333333
Epoch[4] Acc: 0.7505333333333334
Epoch[4] Acc: 0.8129333333333333
Epoch[5] Acc: 0.8209333333333333
Epoch[5] Acc: 0.8405333333333334
Epoch[5] Acc: 0.8430666666666666
