<a href="https://colab.research.google.com/github/ArshT/Pytorch_Practice/blob/master/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as Optim
from torchtext.legacy import data

In [2]:
# Review text: tokenized with spaCy. include_lengths=True makes every batch
# expose its text as a (token_ids, lengths) pair, which the training and
# evaluation code unpacks before calling the model.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
)

# Sentiment label, stored as a float tensor so it can be fed straight to the
# BCE-with-logits loss used later in the notebook.
LABEL = data.LabelField(dtype=torch.float)

In [3]:
from torchtext.legacy import datasets

# Load the IMDB train/test splits, processing text and labels with the fields defined above.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [4]:
# Carve a validation set out of the training data (0.7 is torchtext's default train fraction).
# NOTE: this rebinds train_data, so re-running the cell splits the already-reduced set again.
train_data, val_data = train_data.split(split_ratio=0.7)

In [5]:
MAX_VOCAB_SIZE = 25000

# Build the vocabularies from the training split only. The text vocabulary is
# capped at MAX_VOCAB_SIZE tokens and paired with 100-d GloVe vectors; tokens
# without a pretrained vector are initialised via torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

In [6]:
BATCH_SIZE = 64

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines (or Colab runtimes) without CUDA instead of crashing here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True orders the examples of each batch by length; the model
# packs its input with pack_padded_sequence, which by default expects lengths
# sorted in decreasing order.
train_iterator,val_iterator,test_iterator = data.BucketIterator.splits((train_data,val_data,test_data),
                                                                        batch_size=BATCH_SIZE,
                                                                        sort_within_batch=True,
                                                                        device=device)

In [7]:
class Net(nn.Module):
  """LSTM sentence classifier: embedding -> (stacked, optionally bidirectional) LSTM -> linear head.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding (LSTM input size).
    hidden_dim: LSTM hidden size per direction.
    output_dim: number of output units of the final linear layer.
    n_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM runs in both directions.
    dropout: dropout probability applied to the embeddings, between LSTM
      layers (only when n_layers > 1) and to the final hidden state.
    pad_idx: vocabulary index of the padding token (passed as padding_idx).
  """

  def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout,pad_idx):
    super(Net,self).__init__()

    # Remembered so forward() knows how to assemble the final hidden state.
    self.bidirectional = bidirectional

    self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx)

    # nn.LSTM applies dropout only between stacked layers and warns when a
    # non-zero value is given for a single layer, so pass 0 in that case.
    lstm_dropout = dropout if n_layers > 1 else 0.0
    self.lstm_layer = nn.LSTM(embedding_dim,hidden_dim,num_layers=n_layers,bidirectional=bidirectional,dropout=lstm_dropout)

    # The head sees one hidden vector per direction, so its input width must
    # follow the bidirectional flag instead of always being hidden_dim*2.
    num_directions = 2 if bidirectional else 1
    self.fc = nn.Linear(hidden_dim*num_directions,output_dim)

    self.dropout = nn.Dropout(dropout)


  def forward(self,text,text_lengths):
    """Return raw (pre-sigmoid) scores of shape (batch, output_dim).

    Args:
      text: token-id tensor in sequence-first layout (seq_len, batch); neither
        the LSTM nor pack_padded_sequence is created with batch_first.
      text_lengths: per-example lengths. pack_padded_sequence is called with
        its default enforce_sorted, so lengths must be in decreasing order.
    """
    embedded = self.dropout(self.embedding(text))

    # pack_padded_sequence needs the lengths on the CPU.
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,text_lengths.to('cpu'))

    # Only the final hidden state is used; per-step outputs and cell state are discarded.
    _,(hidden,_) = self.lstm_layer(packed_embedded)

    if self.bidirectional:
      # Last two slices = forward and backward states of the top layer.
      final_hidden = torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)
    else:
      # Unidirectional: the last slice alone is the top layer's state.
      final_hidden = hidden[-1,:,:]

    return self.fc(self.dropout(final_hidden))

In [8]:
# Model hyperparameters.
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_DIM = 100   # matches the 100-d GloVe vectors attached to the vocabulary
HIDDEN_DIM = 256
OUTPUT_DIM = 1        # a single logit for the binary sentiment label
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Build the model and move it to the selected device (Module.to returns the module itself).
net = Net(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
).to(device)

print(net)

Net(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm_layer): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [9]:
# Overwrite the randomly initialised embedding table with the pretrained
# vectors attached to the vocabulary. copy_ works in place and returns the
# destination tensor, which is what the cell displays.
pretrained_embeddings = TEXT.vocab.vectors
net.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.9850,  0.7400,  2.6182,  ..., -0.6623, -1.2012,  1.1092],
        [-1.1947, -0.6932, -0.3686,  ...,  1.3607, -0.4500,  0.6598],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7221,  0.7693, -0.2417,  ..., -1.1507,  1.2279, -0.2337],
        [-1.7035,  1.4701,  0.6830,  ...,  0.4607, -0.1707, -1.2048],
        [-0.2834, -1.5427, -0.0986,  ...,  1.0882,  0.1482,  0.1804]],
       device='cuda:0')

In [10]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the two special tokens (<unk>, <pad>) to all zeros.
for special_idx in (UNK_IDX, PAD_IDX):
  net.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(net.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7221,  0.7693, -0.2417,  ..., -1.1507,  1.2279, -0.2337],
        [-1.7035,  1.4701,  0.6830,  ...,  0.4607, -0.1707, -1.2048],
        [-0.2834, -1.5427, -0.0986,  ...,  1.0882,  0.1482,  0.1804]],
       device='cuda:0')


In [11]:
# Adam over every model parameter, with the optimizer's default hyperparameters.
optimizer = Optim.Adam(params=net.parameters())

In [12]:
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss),
# moved to the same device as the model.
criterion = nn.BCEWithLogitsLoss().to(device)
criterion

BCEWithLogitsLoss()

In [13]:
def binary_accuracy(model,iterator,criterion):
  """Evaluate `model` on every batch of `iterator`.

  The model is switched to eval mode (and left in it) and the whole pass runs
  under torch.no_grad(), so no autograd graph is built during evaluation.

  Args:
    model: callable module invoked as model(text, text_lengths), returning logits.
    iterator: sized iterable of batches exposing `.text` (a (text, lengths)
      pair) and `.label` (1-D tensor of 0/1 labels).
    criterion: loss applied to (logits, labels reshaped to (batch, 1)).

  Returns:
    (mean loss per batch, mean accuracy per batch). Both are averaged over
    batches, so a smaller final batch weighs as much as a full one.

  Raises:
    ZeroDivisionError: if `iterator` has length 0.
  """
  epoch_loss = 0
  epoch_acc = 0

  # Set eval mode once, not once per batch.
  model.eval()

  with torch.no_grad():
    for batch in iterator:
      text,text_lengths = batch.text
      labels = batch.label
      labels = labels.reshape(labels.shape[0],1)

      pred = model(text,text_lengths)

      # Logit -> probability -> hard 0/1 prediction.
      rounded_preds = torch.round(torch.sigmoid(pred))
      correct = (rounded_preds == labels).float()
      acc = correct.sum() / len(correct)

      loss = criterion(pred,labels)

      epoch_loss += loss.item()
      epoch_acc  += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
n_epochs = 10

for epoch in range(n_epochs):
  # Training mode once per epoch is enough: only the validation pass at the
  # end of the epoch switches the model to eval mode.
  net.train()

  train_epoch_loss = 0
  train_epoch_acc = 0

  for batch in train_iterator:
    text,text_lengths = batch.text
    # Labels arrive as a 1-D tensor; make them (batch, 1) to match the logits.
    labels = batch.label.reshape(batch.label.shape[0],1)

    # One optimisation step.
    optimizer.zero_grad()
    predictions = net(text,text_lengths)
    loss = criterion(predictions,labels)
    loss.backward()
    optimizer.step()

    train_epoch_loss += loss.item()

    # Batch accuracy: fraction of thresholded predictions that equal the label.
    hits = (torch.round(torch.sigmoid(predictions)) == labels).float()
    train_epoch_acc += (hits.sum() / len(hits)).item()

  # Turn the running sums into per-batch averages.
  n_train_batches = len(train_iterator)
  train_epoch_acc /= n_train_batches
  train_epoch_loss /= n_train_batches

  val_loss,val_acc = binary_accuracy(net,val_iterator,criterion)
  print("Epoch:",epoch+1)
  print("train:",train_epoch_loss,train_epoch_acc)
  print("val:",val_loss,val_acc)
  print()

Epoch: 1
train: 0.6314426115612044 0.6316475496239906
val: 0.50266882524652 0.768802966101695

Epoch: 2
train: 0.5303703298533919 0.7347578858807139
val: 0.3983468602774507 0.8289194915254238

Epoch: 3
train: 0.40500562458577816 0.8257706596033416
val: 0.3528545486219859 0.8560646186440678

Epoch: 4
train: 0.34612352993801565 0.8510003912187841
val: 0.31819902246786375 0.8674523305084746

Epoch: 5
train: 0.3026000563680691 0.8751547836909329
val: 0.2966156202857777 0.8866525423728814

Epoch: 6
train: 0.2839069991868778 0.8836597369535126
val: 0.312440714341099 0.8907574152542372

Epoch: 7
train: 0.23174012202198488 0.9076756386861314
val: 0.2713197186841803 0.8997616525423728

Epoch: 8
train: 0.20655348619622907 0.9207752216471373
val: 0.2613512265606452 0.8989671610169492

Epoch: 9
train: 0.17923697375141792 0.9331334724913548
val: 0.26635889027078274 0.8990112997717776

Epoch: 10
train: 0.16262604555890073 0.9380132299270073
val: 0.3085264937983731 0.8987023305084746

