In [16]:
import pandas as pd

#@title Default title text
def get_sentence_partitions():
    sentences = pd.read_csv('/content/datasetSentences.txt', index_col="sentence_index",
                                sep="\t")
    splits = pd.read_csv('/content/datasetSplit.txt', index_col="sentence_index")
    return sentences.join(splits).set_index("sentence")


In [17]:
#@title Default title text
def get_phrase_sentiments():
    dictionary = pd.read_csv('/content/dictionary.txt', sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")

    sentiment_labels = pd.read_csv('/content/sentiment_labels.txt', sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")

    phrase_sentiments = dictionary.join(sentiment_labels)

    phrase_sentiments["new_labels"] = pd.cut(phrase_sentiments.sentiment, [0,0.2,0.4,0.6,0.8,1.0],
                                           precision = 6, 
                                           include_lowest=True,
                                           labels=["1","2","3","4","5"])
    #phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


In [18]:
def partition():
    phrase_sentiments = get_phrase_sentiments()
    sentence_partitions = get_sentence_partitions()
    # noinspection PyUnresolvedReferences
    data = phrase_sentiments.join(sentence_partitions, on="phrase")
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")


In [19]:
for splitset, partition in partition():
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = ("/content/stanford-sentiment-treebank.%s.csv" % split_name)
    del partition["splitset_label"]
    partition.to_csv(filename,sep='|')

In [20]:
import pandas as pd

In [21]:
train = pd.read_csv("/content/stanford-sentiment-treebank.train.csv",sep='|')

In [22]:
valid = pd.read_csv("/content/stanford-sentiment-treebank.dev.csv",sep='|')

In [23]:
import random
import torch, torchtext
from torchtext import data

In [24]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd93c6c0630>

In [25]:
Review = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [26]:
fields = [('review', Review), ('label', Label)]

In [None]:
example = [torchtext.legacy.data.Example.fromlist([train.phrase[i],train.new_labels[i]], fields) for i in range(train.shape[0])] 

In [None]:
example_2 = [torchtext.legacy.data.Example.fromlist([valid.phrase[i],valid.new_labels[i]], fields) for i in range(valid.shape[0])] 

In [15]:
import os, pickle
with open('/content/train_data.pkl', 'wb') as tokens: 
    pickle.dump(example,tokens)
with open('/content/valid_data.pkl', 'wb') as tokens: 
    pickle.dump(example_2,tokens)

UnpicklingError: ignored

In [None]:
Stanford_dataset_train = torchtext.legacy.data.Dataset(example, fields)
Stanford_dataset_valid = torchtext.legacy.data.Dataset(example_2, fields)

In [None]:
len(Stanford_dataset_train), len(Stanford_dataset_valid)

(236077, 1044)

In [None]:
vars(Stanford_dataset_train.examples[11])

{'label': 10, 'review': ['!', 'Run']}

In [None]:
Review.build_vocab(Stanford_dataset_train)
Label.build_vocab(Stanford_dataset_train)

In [None]:
print('Size of input vocab : ', len(Review.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Review.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  20835
Size of label vocab :  25
Top 10 words appreared repeatedly : [('the', 74256), (',', 67908), ('a', 53131), ('of', 50686), ('and', 50326), ('to', 36096), ('.', 35187), ('-', 35182), ("'s", 27266), ('in', 21919)]
Labels :  defaultdict(None, {12: 0, 13: 1, 11: 2, 15: 3, 14: 4, 9: 5, 10: 6, 18: 7, 17: 8, 16: 9, 19: 10, 7: 11, 6: 12, 8: 13, 20: 14, 5: 15, 4: 16, 21: 17, 3: 18, 22: 19, 2: 20, 23: 21, 1: 22, 0: 23, 24: 24})


In [None]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((Stanford_dataset_train, Stanford_dataset_valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.review),
                                                            sort_within_batch=True, device = device)

In [None]:
next(iter(train_iterator))
#len(train.examples[11].tweet)


[torchtext.legacy.data.batch.Batch of size 32]
	[.review]:('[torch.LongTensor of size 32x8]', '[torch.LongTensor of size 32]')
	[.label]:[torch.LongTensor of size 32]

In [None]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Review.vocab.stoi, tokens)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [None]:
# Define hyperparameters
size_of_vocab = len(Review.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 25
num_layers = 2
dropout = 0.2

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [None]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(20835, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=100, out_features=25, bias=True)
)
The model has 6,494,625 trainable parameters


In [None]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.review  
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.review
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 3.059 | Train Acc: 23.68%
	 Val. Loss: 3.252 |  Val. Acc: 3.31% 

	Train Loss: 3.044 | Train Acc: 24.09%
	 Val. Loss: 3.252 |  Val. Acc: 3.31% 

	Train Loss: 3.044 | Train Acc: 24.10%
	 Val. Loss: 3.252 |  Val. Acc: 3.31% 

	Train Loss: 3.044 | Train Acc: 24.10%
	 Val. Loss: 3.252 |  Val. Acc: 3.31% 



In [None]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]