In [235]:
import pandas as pd

df = pd.read_csv("/content/tweets.csv")
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


# New Section

In [236]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [237]:
import random
import torch, torchtext
from torchtext import data

In [238]:
# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f0a0f93dd30>

In [239]:
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [240]:
fields = [('tweet', Tweet), ('label', Label)]

In [241]:
example = [torchtext.legacy.data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])]

In [242]:
twitterDataset = torchtext.legacy.data.Dataset(example, fields)

In [243]:
(train, valid) = twitterDataset.split(split_ratio=[85, 15], random_state = random.seed(SEED))

In [244]:
len(train), len(valid)

(1159, 205)

In [245]:
vars(train.examples[10])

{'label': 0,
 'tweet': ['Obama',
  ',',
  'Romney',
  'agree',
  ':',
  'Admit',
  'women',
  'to',
  'Augusta',
  'golf',
  'club',
  ':',
  'US',
  'President',
  'Barack',
  'Obama',
  'believes',
  'women',
  'should',
  'be',
  'allowe',
  '...',
  'http://t.co/PVKrepqI']}

In [246]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [247]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4651
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1069), (':', 783), ('#', 780), ('.', 761), (',', 598), ('"', 550), ('the', 542), ('RT', 516), ('?', 419), ('to', 400)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [248]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [249]:
train_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweet),
                                                            sort_within_batch=True, device = device)

In [250]:
next(iter(train_iterator))
#len(train.examples[10].tweet)


[torchtext.legacy.data.batch.Batch of size 32]
	[.tweet]:('[torch.LongTensor of size 32x8]', '[torch.LongTensor of size 32]')
	[.label]:[torch.LongTensor of size 32]

In [251]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [252]:
import torch.nn as nn
import torch.nn.functional as F
class Encoder(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           #dropout=dropout,
                           batch_first=True)


    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        # pass to encoder
        packed_output, (hidden_encoder, cell_encoder) = self.encoder(packed_embedded)

        # unpack sequence
        encoder_output, encoder_output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
    
        # encoder o/p for decoder ; embedded for visualization purpose
        return encoder_output, embedded

In [253]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
embedding_dim = 300 
num_hidden_nodes = 100
num_layers = 1

# Instantiate the Encoder
encoder = Encoder(size_of_vocab, embedding_dim, num_hidden_nodes, num_layers)

In [254]:
print(encoder)

#No. of trianable parameters
def count_parameters(encoder):
    return sum(p.numel() for p in encoder.parameters() if p.requires_grad)
    
print(f'The encoder has {count_parameters(encoder):,} trainable parameters')

Encoder(
  (embedding): Embedding(4651, 300)
  (encoder): LSTM(300, 100, batch_first=True)
)
The encoder has 1,556,100 trainable parameters


In [255]:
class Decoder(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, encoder_output_dim, hidden_dim, output_dim, n_layers):
        
        super().__init__()          
        
        # LSTM layer
        self.decoder = nn.LSTM(encoder_output_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           batch_first=True)

        self.fc= nn.Linear(hidden_dim, output_dim)


    def forward(self, encoder_output):
        
        # pass to decoder
        output, (hidden_decoder, cell_decoder) = self.decoder(encoder_output)

        # Linear
        dense_outputs = self.fc(hidden_decoder)   

        final_output = F.softmax(dense_outputs[0], dim=1)

        # final_output is prediction & output is decoder output for visualization purpose
        return final_output, output

In [256]:
# Define hyperparameters
size_of_vocab = len(Tweet.vocab)
encoder_output_dim = 100
n_hidden_nodes = 70
num_output_nodes = 3
num_layers = 1

# Instantiate the Decoder
decoder = Decoder(size_of_vocab, encoder_output_dim, n_hidden_nodes, num_output_nodes, num_layers)

In [257]:
print(decoder)

#No. of trianable parameters
def count_parameters(decoder):
    return sum(p.numel() for p in decoder.parameters() if p.requires_grad)
    
print(f'The decoder has {count_parameters(decoder):,} trainable parameters')

Decoder(
  (decoder): LSTM(100, 70, batch_first=True)
  (fc): Linear(in_features=70, out_features=3, bias=True)
)
The decoder has 48,373 trainable parameters


In [258]:
import torch.optim as optim

# define optimizer and loss
encoder_optimizer = optim.Adam(encoder.parameters(), lr=2e-4)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
encoder = encoder.to(device)
decoder = decoder.to(device)
criterion = criterion.to(device)

In [259]:
def train(encoder, decoder, iterator, encoder_optimizer, decoder_optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    encoder.train()  
    decoder.train()
    
    for batch in iterator:
        
        # resets the gradients after every batch
        encoder_optimizer.zero_grad()   
        decoder_optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweet  
        
        # convert to 1D tensor
        encoder_out, embedding = encoder(tweet, tweet_lengths)
        
        predictions, decoder_out = decoder(encoder_out)
        predictions = predictions.squeeze() 

        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [260]:
def evaluate(encoder, decoder, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    encoder.eval()
    decoder.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweet
            
            # convert to 1d tensor
            encoder_out, embedding = encoder(tweet, tweet_lengths)
        
            predictions , decoder_out = decoder(encoder_out)
            predictions = predictions.squeeze() 
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [261]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(encoder, decoder, train_iterator, encoder_optimizer, decoder_optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(encoder, decoder, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(encoder.state_dict(), 'encoder_saved_weights.pt')
        torch.save(decoder.state_dict(), 'decoder_saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')
    
    trainLoss.append(train_loss)
    validLoss.append(valid_loss)

    trainAcc.append(train_acc)
    validAcc.append(valid_acc)

	Train Loss: 1.071 | Train Acc: 64.73%
	 Val. Loss: 1.045 |  Val. Acc: 68.30% 

	Train Loss: 0.975 | Train Acc: 69.12%
	 Val. Loss: 0.900 |  Val. Acc: 68.30% 

	Train Loss: 0.874 | Train Acc: 69.12%
	 Val. Loss: 0.873 |  Val. Acc: 68.30% 

	Train Loss: 0.864 | Train Acc: 69.12%
	 Val. Loss: 0.866 |  Val. Acc: 68.75% 

	Train Loss: 0.855 | Train Acc: 69.72%
	 Val. Loss: 0.854 |  Val. Acc: 68.75% 

	Train Loss: 0.828 | Train Acc: 72.08%
	 Val. Loss: 0.844 |  Val. Acc: 71.43% 

	Train Loss: 0.799 | Train Acc: 77.96%
	 Val. Loss: 0.835 |  Val. Acc: 72.77% 

	Train Loss: 0.769 | Train Acc: 80.66%
	 Val. Loss: 0.828 |  Val. Acc: 74.11% 

	Train Loss: 0.729 | Train Acc: 85.56%
	 Val. Loss: 0.824 |  Val. Acc: 74.55% 

	Train Loss: 0.701 | Train Acc: 87.42%
	 Val. Loss: 0.810 |  Val. Acc: 76.34% 



In [262]:
#load weights and tokenizer

en_path='./encoder_saved_weights.pt'
encoder.load_state_dict(torch.load(en_path));
encoder.eval();

de_path='./decoder_saved_weights.pt'
decoder.load_state_dict(torch.load(de_path));
decoder.eval();

tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)
#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    encoder_out, embedding = encoder(tensor, length_tensor)
    prediction, decoder_out = decoder(encoder_out)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [263]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

'Positive'