<a href="https://colab.research.google.com/github/AnudeepReddy-Katta/SentimentAnalysis/blob/main/notebooks/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
train = pd.read_csv('/content/drive/MyDrive/NLP/train_aug.csv', index_col=0)
test = pd.read_csv('/content/drive/MyDrive/NLP/test.csv', index_col=0)

In [3]:
train.head()

Unnamed: 0,sentence,sentiment_values
0,"In gleefully , thumpingly hyperbolic terms , i...",2
1,These people would n't know subtle characteriz...,0
2,"A grittily beautiful film that looks , sounds ...",3
3,No more .,1
4,Although ... visually striking and slickly sta...,1


In [4]:
test.head()

Unnamed: 0,sentence,sentiment_values
0,You might be shocked to discover that Seinfeld...,2
1,"For a film that celebrates radical , nonconfor...",2
2,Films are made of little moments .,2
3,The main problem being that it 's only a peek .,2
4,Even the imaginative gore ca n't hide the must...,1


In [5]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f1c53e53870>

In [6]:
Sentence = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [7]:
fields = [('sentences', Sentence),('labels',Label)]

In [8]:
example = [data.Example.fromlist([train.sentence[i],train.sentiment_values[i]], fields) for i in range(train.shape[0])] 

In [9]:
train_data = data.Dataset(example, fields)

In [10]:
example_test = [data.Example.fromlist([test.sentence[i],test.sentiment_values[i]], fields) for i in range(test.shape[0])] 

In [11]:
valid_data = data.Dataset(example_test, fields)

In [12]:
(len(train_data), len(valid_data))

(12483, 2371)

In [13]:
vars(valid_data.examples[50])

{'labels': 3,
 'sentences': ['Watching',
  'Austin',
  'Powers',
  'in',
  'Goldmember',
  'is',
  'like',
  'binging',
  'on',
  'cotton',
  'candy',
  '.']}

In [14]:
Sentence.build_vocab(train_data)
Label.build_vocab(train_data)

In [15]:
print('Size of input vocab : ', len(Sentence.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Sentence.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  18141
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 11147), (',', 9917), ('the', 8363), ('and', 6119), ('of', 6047), ('a', 5985), ('to', 4226), ('-', 3796), ("'s", 3497), ('is', 3463)]
Labels :  defaultdict(None, {1: 0, 0: 1, 2: 2, 3: 3, 4: 4})


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size = 64, 
                                                            sort_key = lambda x: len(x.sentences),
                                                            sort_within_batch=True, 
                                                            device = device)

In [18]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Sentence.vocab.stoi, tokens)

In [19]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True,
                           bidirectional=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [20]:
# Define hyperparameters
size_of_vocab = len(Sentence.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [21]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(18141, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 6,006,005 trainable parameters


In [22]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# define metric
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [23]:

def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        sent, sent_lengths = batch.sentences   
        
        # convert to 1D tensor
        predictions = model(sent, sent_lengths).squeeze()  
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = categorical_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            sent, sent_lengths = batch.sentences
            
            # convert to 1d tensor
            predictions = model(sent, sent_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = categorical_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.588 | Train Acc: 26.64%
	 Val. Loss: 1.567 |  Val. Acc: 30.89% 

	Train Loss: 1.473 | Train Acc: 42.43%
	 Val. Loss: 1.553 |  Val. Acc: 31.22% 

	Train Loss: 1.335 | Train Acc: 57.54%
	 Val. Loss: 1.549 |  Val. Acc: 32.46% 

	Train Loss: 1.216 | Train Acc: 70.45%
	 Val. Loss: 1.528 |  Val. Acc: 35.18% 

	Train Loss: 1.132 | Train Acc: 78.43%
	 Val. Loss: 1.540 |  Val. Acc: 33.72% 



In [26]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_sentence(sentence):
    
    categories = {0: "very negative", 1:"negative", 2:"neutral", 3:"positive", 4:"very positive"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    pred = prediction.argmax(1)
    
    return categories[pred.item()]

In [27]:
classify_sentence("The notion that bombing buildings is the funniest thing in the world goes entirely unexamined in this startlingly unfunny comedy .")

'negative'

In [28]:
classify_sentence("Visually rather stunning , but ultimately a handsome-looking bore , the true creativity would have been to hide Treasure Planet entirely and completely reimagine it .")

'very negative'

In [29]:

classify_sentence("Miller is playing so free with emotions , and the fact that children are hostages to fortune , that he makes the audience hostage to his swaggering affectation of seriousness .")

'positive'

In [30]:
classify_sentence("Schaeffer has to find some hook on which to hang his persistently useless movies , and it might as well be the resuscitation of the middle-aged character .")

'very negative'

In [31]:
classify_sentence("There 's just no currency in deriding James Bond for being a clichÃ©d , doddering , misogynistic boy 's club .")

'neutral'

In [32]:
classify_sentence("When the film ended , I felt tired and drained and wanted to lie on my own deathbed for a while .")

'negative'

In [33]:
classify_sentence("Full of witless jokes , dealing in broad stereotypes and outrageously unbelievable scenarios , and saddled with a general air of misogyny")

'negative'

In [34]:
classify_sentence("The film 's hackneyed message is not helped by the thin characterizations , nonexistent plot and pretentious visual style .")

'negative'

In [35]:
classify_sentence("Detox is ultimately a pointless endeavor .")

'very negative'

In [36]:
classify_sentence("Too much of the humor falls flat .")

'very negative'

In [37]:
classify_sentence("No way I can believe this load of junk .")

'negative'