#Imports

In [26]:
import torch,torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets 
import random
import spacy
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

#Prepare data

The first step is to generate bigram features, as described in the paper.

In [2]:
def bigramF(sentence):
  '''Return the tokens of a sentence followed by its bigram features.

  Every distinct pair of adjacent tokens is joined with a single space and
  placed after the original unigrams, in order of first occurrence.

  Args:
    sentence: list of token strings.

  Returns:
    A new list: the original tokens followed by the de-duplicated bigrams.
    The input list is left unmodified.
  '''
  # dict.fromkeys removes duplicate bigrams (as the former set() did) while
  # keeping first-occurrence order, so the result is identical on every run.
  bigrams = dict.fromkeys(zip(sentence, sentence[1:]))
  return list(sentence) + [' '.join(pair) for pair in bigrams]

Example of what the output looks like:

In [None]:
# Tokenize on whitespace and show the unigram + bigram features.
sentence='this film is great'
sent=sentence.split()
bigramF(sent)

['this', 'film', 'is', 'great', 'this film', 'film is', 'is great']

Setting the random seed

In [3]:
def setSeed(seed):
  '''Seed every random number generator used in this notebook.

  Args:
    seed: integer applied to Python's `random`, NumPy, and the PyTorch
      CPU and CUDA generators.
  '''
  random.seed(seed)
  np.random.seed(seed)
  for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)


# Fixed seed reused by later cells.
seed=124
setSeed(seed)

Next we create the torchtext `Field` objects that tokenize and preprocess the texts and labels. We then load the IMDB dataset that ships with torchtext and build the vocabulary from the training split. The result is a vocabulary that maps each token to an index and to a 100-dimensional pretrained GloVe vector.

In [4]:
# Tokenize reviews with spaCy and extend every example with bigram features.
TEXT=data.Field(tokenize='spacy',tokenizer_language='en_core_web_sm',preprocessing=bigramF)
LABEL=data.LabelField(dtype=torch.float)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
# unk_init is called with a tensor for every token that has no GloVe vector and
# must return a tensor of the same size. torch.Tensor.normal_ fills that tensor
# in place with N(0, 1) samples; torch.normal(tensor) would instead interpret
# the tensor's (uninitialised) contents as per-element means.
TEXT.build_vocab(train,unk_init=torch.Tensor.normal_,max_size=25000,vectors='glove.6B.100d')
LABEL.build_vocab(train)


aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 29.3MB/s]
.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399981/400000 [00:13<00:00, 29057.85it/s]

We split the training data into training and validation sets, then create iterators that loop over each split in batches.

In [7]:
# random.seed() returns None, so passing its result as random_state only worked
# through the call's side effect. Seed explicitly, then hand over the RNG state.
random.seed(seed)
traindata ,  valid = train.split(random_state=random.getstate())
# Prefer the GPU when one is available.
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# One batched iterator per split, with batches placed on `device`.
trainLoader,validLoader,testLoader=data.BucketIterator.splits(datasets=(traindata,valid,test),batch_size=64,device=device)

#Build the model 

The model is implemented as in the paper: an embedding layer comes first, and its output is averaged over the sequence dimension (the tokens of each review) using an average-pooling filter. A linear layer then produces the final output.

In [5]:
class FastText(nn.Module):
  '''Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.'''

  def __init__(self,inputdim,embeddim,hiddim,outputdim,pad_idx):
    '''Create the embedding table and the output layer.

    Args:
      inputdim: number of rows in the embedding table.
      embeddim: size of each embedding vector.
      hiddim: accepted for signature compatibility; not used by any layer here.
      outputdim: number of output units of the linear layer.
      pad_idx: index passed to nn.Embedding as padding_idx.
    '''
    super().__init__()
    self.embedding=nn.Embedding(num_embeddings=inputdim,embedding_dim=embeddim,padding_idx=pad_idx)
    self.linear=nn.Linear(in_features=embeddim,out_features=outputdim)

  def forward(self,input):
    '''Return one row of linear-layer outputs per sequence.

    The permute below swaps the first two axes of the embedded input, so
    `input` is indexed as (sequence position, batch).
    '''
    # (seq, batch, emb) -> (batch, seq, emb)
    per_example=self.embedding(input).permute(1,0,2)
    seq_len=per_example.shape[1]
    # A (seq_len, 1) window averages over every position of each sequence.
    pooled=F.avg_pool2d(per_example,kernel_size=(seq_len,1))
    return self.linear(pooled.squeeze(1))






#Train

Define the model's hyperparameters, optimizer and loss, followed by the training function.

In [39]:
# Model hyper-parameters.
embeddim=100
hiddim=10
outputdim=1
inputdim=len(TEXT.vocab)
pad_idx=TEXT.vocab.stoi[TEXT.pad_token]

# Build the network on the selected device; `.to` returns the module itself.
# NOTE(review): the GloVe vectors attached to TEXT.vocab are not copied into
# model.embedding in any visible cell - confirm whether that is intended.
model=FastText(inputdim,embeddim,hiddim,outputdim,pad_idx).to(device)

# Binary cross-entropy on raw logits, optimised with Adam's default settings.
criterion=nn.BCEWithLogitsLoss()
optimizor=torch.optim.Adam(model.parameters())


In [40]:
def train(loader,model,criterion,optimizor):
  '''Run one optimisation pass over `loader`.

  Args:
    loader: iterable of batches exposing `.text` and `.label`.
    model: network called on `batch.text`.
    criterion: loss applied to the squeezed model output and `batch.label`.
    optimizor: optimizer stepped once per batch.

  Returns:
    (loss, accuracy): per-batch values summed and divided by len(loader).
  '''
  model.train()
  loss_total=0
  acc_total=0
  for batch in loader:
    optimizor.zero_grad()
    # Drop dimension 1 of the model output before comparing with the labels.
    logits=model(batch.text).squeeze(1)
    batch_loss=criterion(logits,batch.label)
    batch_loss.backward()
    optimizor.step()
    loss_total+=batch_loss.item()
    acc_total+=acc(logits,batch.label).item()

  n_batches=len(loader)
  return loss_total/n_batches,acc_total/n_batches




In [41]:
def evaluate(loader,model,criterion):
  '''Score `model` on `loader` without updating its weights.

  Args:
    loader: iterable of batches exposing `.text` and `.label`.
    model: network called on `batch.text`.
    criterion: loss applied to the squeezed model output and `batch.label`.

  Returns:
    (loss, accuracy): per-batch values summed and divided by len(loader).
  '''
  model.eval()
  loss_total=0
  acc_total=0
  # Gradients are not needed for scoring.
  with torch.no_grad():
    for batch in loader:
      logits=model(batch.text).squeeze(1)
      loss_total+=criterion(logits,batch.label).item()
      acc_total+=acc(logits,batch.label).item()

  n_batches=len(loader)
  return loss_total/n_batches,acc_total/n_batches


#Binary Accuracy

A function that computes the fraction of correctly predicted classes in a batch.

In [47]:
def acc(predicted, groundtruth):
  '''Fraction of predictions that match the labels.

  Logits go through a sigmoid; probabilities above 0.5 count as class 1 and
  probabilities at or below 0.5 as class 0.

  Args:
    predicted: tensor of raw model outputs (logits).
    groundtruth: tensor of labels compared element-wise with the classes.

  Returns:
    A 0-dim tensor: number of matches divided by len(predicted).
  '''
  probs=torch.sigmoid(predicted).detach()
  # Two-step thresholding; values matching neither test are left untouched.
  hard=torch.where(probs>0.5,torch.ones_like(probs),probs)
  hard=torch.where(hard<=0.5,torch.zeros_like(hard),hard)
  n_correct=torch.eq(hard,groundtruth).sum()
  return n_correct/len(predicted)

#Epoch time

In [18]:
import time 
def epochT(start, end):
  '''Split the time elapsed between two timestamps into minutes and seconds.

  Args:
    start: start time in seconds.
    end: end time in seconds.

  Returns:
    (minutes, seconds) as ints; fractional parts are truncated.
  '''
  elapsed=end-start
  whole_minutes=int(elapsed/60)
  leftover_seconds=int(elapsed-whole_minutes*60)
  return whole_minutes,leftover_seconds

Run the model for a number of epochs, printing the loss and accuracy for each, and save the checkpoint with the lowest validation loss.

In [48]:
# Track the lowest validation loss so only the best checkpoint is kept on disk.
best_loss=float('inf')
for epoch in range(5):
  # start/end bracket both the training and the validation pass, so the two
  # messages below report the same combined duration.
  start=time.time()
  trainloss,train_acc=train(trainLoader,model,criterion,optimizor)
  validloss,val_acc=evaluate(validLoader,model,criterion)
  end=time.time()
  # Named mins/secs so the builtin min() is not shadowed in the notebook namespace.
  mins,secs=epochT(start,end)
  print(f'train loss is  :{trainloss:.2f} and train_accuracy is {train_acc:.2f} trained on {mins} minutes and {secs} seconds')
  print(f'validation loss is {validloss:.2f} and validation_accuracy is {val_acc:.2f} evaluated on {mins} minutes and {secs} seconds ')
  if validloss<best_loss:
    best_loss=validloss
    torch.save(model.state_dict(),'SentimentModel.pt')



train loss is  :0.61 and train_accuracy is 0.77 trained on 0 minutes and 9 seconds
validation loss is 0.47 and validation_accuracy is 0.78 evaluated on 0 minutes and 9 seconds 
train loss is  :0.54 and train_accuracy is 0.82 trained on 0 minutes and 9 seconds
validation loss is 0.42 and validation_accuracy is 0.81 evaluated on 0 minutes and 9 seconds 
train loss is  :0.48 and train_accuracy is 0.85 trained on 0 minutes and 9 seconds
validation loss is 0.39 and validation_accuracy is 0.84 evaluated on 0 minutes and 9 seconds 
train loss is  :0.43 and train_accuracy is 0.87 trained on 0 minutes and 9 seconds
validation loss is 0.38 and validation_accuracy is 0.86 evaluated on 0 minutes and 9 seconds 
train loss is  :0.38 and train_accuracy is 0.89 trained on 0 minutes and 9 seconds
validation loss is 0.39 and validation_accuracy is 0.86 evaluated on 0 minutes and 9 seconds 


#Test accuracy

In [49]:
# Restore the saved checkpoint; map_location lets a checkpoint saved from a
# GPU run load on whichever device is in use now.
model.load_state_dict(torch.load('SentimentModel.pt',map_location=device))
# Bind the unused loss to a real name: assigning to `_` would override
# IPython's last-output variable.
test_loss,accurac=evaluate(testLoader,model,criterion)
print(f'Test accuracy is {accurac:.2f} ')

Test accuracy is 0.85 


#User input

Classify a user-provided sentence as positive or negative.

In [61]:
lang=spacy.load('en_core_web_sm')
def predict(sentence,model):
  '''Classify a raw sentence with the trained model.

  Args:
    sentence: text to classify.
    model: network called on a column tensor of token indices.

  Returns:
    1 if the sigmoid of the model output exceeds 0.5, otherwise 0, as a
    Python int.

  Raises:
    ValueError: if tokenization yields no tokens.
  '''
  tokenized=[token.text for token in lang.tokenizer(sentence)]
  if not tokenized:
    # An empty sequence would otherwise fail inside the model's pooling step.
    raise ValueError('cannot classify an empty sentence')
  # Same preprocessing as the TEXT field: unigrams followed by bigrams.
  preprocessed=bigramF(tokenized)

  indices=[TEXT.vocab.stoi[s] for s in preprocessed]
  # Shape (sequence length, 1): a batch holding this single sentence.
  tensor=torch.LongTensor(indices).to(device).unsqueeze(1)

  # Inference only: use eval mode and skip autograd, then restore the
  # caller's train/eval mode so this function has no lasting side effect.
  was_training=model.training
  model.eval()
  try:
    with torch.no_grad():
      output=torch.sum(torch.sigmoid(model(tensor)) > 0.5)
  finally:
    model.train(was_training)
  return output.item()


In [64]:

# Named `review` rather than `input` so the builtin input() stays usable.
review='this movie really good excellent '
# map_location lets a checkpoint saved from a GPU run load on the current device.
model.load_state_dict(torch.load('SentimentModel.pt',map_location=device))
# Map the predicted class index back to its label string.
LABEL.vocab.itos[predict(review ,model)]

'pos'