# Sentiment Analysis with the Twitter API

In this work, we want to find out whether a tag on Twitter (e.g. "iphone") is positive or negative by running
sentiment analysis on the posts related to that tag. We then use our model to determine whether each post contains positive or negative sentiment.

## Data preparation

Just loading the data and doing some indexing. Nothing much.

In [1]:
# Import necessary libraries.
import torch
from torch import nn
import time

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Used for reproducibility: fixed RNG seed and deterministic cuDNN kernels.
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

cpu


In [4]:
import pytreebank
# Load the sentiment treebank through pytreebank; the splits are read later
# through the "train", "dev" and "test" keys.
dataset = pytreebank.load_sst()

In [5]:
def unpack(data):
    """Flatten an iterable of labeled trees into parallel sentence/label lists.

    Every element of ``data`` must provide ``to_labeled_lines()``, which yields
    ``(label, sentence)`` pairs. Each tree can contribute several pairs, so the
    result is usually much longer than ``data`` itself.

    Args:
        data: Any iterable of tree objects (a list, a generator, ...). It is
            iterated exactly once and never indexed.

    Returns:
        A tuple ``(sentences, labels)`` of two equally long lists, in the
        order the pairs were produced.
    """
    temp_data = []
    temp_label = []
    # Iterate directly: no need to materialise the input just to count it.
    for tree in data:
        for label, sentence in tree.to_labeled_lines():
            temp_data.append(sentence)
            temp_label.append(label)

    return (temp_data, temp_label)

In [6]:
# We do not want to use the treebank structure, so every split
# (train, dev and test) is flattened into parallel sentence/label lists.
train_data, train_label = unpack(dataset["train"])
valid_data, valid_label = unpack(dataset["dev"])
test_data,  test_label  = unpack(dataset["test"])

# Check the length of each flattened data set
train_size = len(train_data)
valid_size = len(valid_data)
test_size = len(test_data)

# These are the same splits as above, just kept in their tree format.
# I do this for the sake of convenience later on (e.g. when building the vocabulary).
# But IT IS NOT A GOOD PRACTICE!
train = dataset["train"]
valid = dataset["dev"]
test  = dataset["test"]

# Number of trees per split. len() is used directly instead of copying each
# split into a throwaway list just to count it.
t_train_size = len(train)
t_valid_size = len(valid)
t_test_size = len(test)

print(t_train_size, t_valid_size, t_test_size)
print(train_size, valid_size, test_size)
print(len(train_label), len(valid_label), len(test_label))

8544 1101 2210
318582 41447 82600
318582 41447 82600


In [7]:
# Let's take a look at the first flattened sentence and its label
train_data[0], train_label[0]

("The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",
 3)

In [8]:
# Tokenize
from torchtext.data.utils import get_tokenizer

# spaCy-backed tokenizer using the 'en_core_web_md' pipeline. It is applied to
# raw sentences when building the vocabulary and inside text_pipeline.
tokenizer = get_tokenizer('spacy', language='en_core_web_md')

In [43]:
#Numericalization

from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):  # data_iter = train, test or validation trees
    """Yield one token list per labeled line found in the trees of `data_iter`."""
    for tree in data_iter:
        # Each tree expands into (label, text) pairs; only the text is tokenized.
        labeled_lines = tree.to_labeled_lines()
        yield from (tokenizer(text) for _label, text in labeled_lines)
        
# Build the vocabulary from the training trees only. The four special tokens
# are registered alongside the corpus tokens (the check further down prints
# <pad>=1, <bos>=2, <eos>=3).
vocab = build_vocab_from_iterator(yield_tokens(train), specials=['<unk>', '<pad>',
                                                                 '<bos>', '<eos>'])

In [44]:
# Any token that is not in the vocabulary maps to the index of '<unk>'.
vocab.set_default_index(vocab["<unk>"])

# Check if our vocab is working.
print(vocab(['Chaky', 'wants', 'his', 'student', 'to', 'be', 'number', '1', '.']))
print(vocab(['<pad>','<bos>','<eos>']))
# Index -> token list, i.e. the inverse lookup of the vocab.
id2word = vocab.get_itos()
id2word[0]

# Vocabulary size; reused later as the embedding layer's input dimension.
len(vocab)

[0, 919, 36, 2733, 9, 28, 908, 3233, 10]
[1, 2, 3]


17136

## Prepare Embedding

In [15]:
from torchtext.vocab import FastText
# Load pretrained FastText vectors for the 'simple' language code
# (presumably Simple English -- confirm against the torchtext documentation).
fast_vectors = FastText(language='simple')

In [16]:
# Now that we have the vectors, it's time to create the embedding matrix:
# one FastText vector is looked up for every token returned by vocab.get_itos(),
# and the resulting tensor is moved to `device`.
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [17]:
# Let's check the shape: one row per vocabulary entry, one column per embedding dimension
fast_embedding.shape

torch.Size([17136, 300])

## Prepare Dataloader

In [18]:
def text_pipeline(x):
    """Tokenize the raw sentence `x` and map every token to its vocabulary index."""
    return vocab(tokenizer(x))
# No label pipeline is applied; the previously considered shift is kept for reference:
#label_pipeline = lambda x: int(x) - 1  #1, 2, 3, 4 ---> 0, 1, 2, 3 #

In [19]:
# Testing text_pipeline: a raw sentence should come back as a list of vocabulary indices
text_pipeline("I love to play football")

[63, 110, 9, 494, 8735]

In [20]:
# Quick reminder of how Tensor.size behaves (collate_batch relies on size(0)).
t = torch.empty(3, 4, 5)
t.size()  # full shape
torch.Size([3, 4, 5])
t.size(0)  # extent of the first dimension only

3

In [21]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence #making each batch same length

pad_ix = vocab['<pad>']  # index of the padding token; used by collate_batch and by the embedding layer

# DataLoader calls this function to turn a list of samples into one batch.
def collate_batch(batch):
    """Collate (label, text) samples into label, padded-text and length tensors.

    Returns a 3-tuple of int64 tensors: the labels, the token indices padded
    with `pad_ix` to the longest sequence in the batch (batch first), and the
    unpadded length of every sequence.
    """
    labels = [sample_label for sample_label, _ in batch]
    encoded = [torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
               for _, raw_text in batch]
    # The true lengths are kept so padded positions can be skipped later on.
    lengths = [token_ids.size(0) for token_ids in encoded]

    return (
        torch.tensor(labels, dtype=torch.int64),
        pad_sequence(encoded, padding_value=pad_ix, batch_first=True),
        torch.tensor(lengths, dtype=torch.int64),
    )

In [22]:
# We need the data in the format of tuples .. e.g. (label, text)

def merge(list1, list2):
    """Pair the items of `list1` with the items of `list2` at the same position.

    The result has one tuple per item of `list1`; `list2` is indexed with the
    matching position, so it must be at least as long as `list1`.
    """
    pairs = []
    for position, left in enumerate(list1):
        pairs.append((left, list2[position]))
    return pairs


# Pair every label with its sentence so each sample is a (label, text) tuple,
# which is the layout collate_batch unpacks.
training_data   = merge(train_label, train_data)
validation_data = merge(valid_label, valid_data)
testing_data    = merge(test_label,  test_data)

In [23]:
# Test the one that we created: the first merged (label, text) sample.
training_data[0]

(3,
 "The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .")

In [24]:
# The one that we already have: first label and sentence from the parallel lists.
# Exactly the same!
train_label[0], train_data[0]

(3,
 "The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .")

In [25]:
batch_size = 64

# Only the training data is shuffled between epochs.
train_loader = DataLoader(training_data, batch_size=batch_size,
                          shuffle=True, collate_fn=collate_batch)

# Evaluation loaders keep a fixed order: the metrics are averaged per batch and
# the last batch is smaller, so shuffling would change the reported numbers
# from one pass to the next even for an unchanged model.
val_loader   = DataLoader(validation_data, batch_size=batch_size,
                          shuffle=False, collate_fn=collate_batch)

test_loader  = DataLoader(testing_data, batch_size=batch_size,
                          shuffle=False, collate_fn=collate_batch)

In [26]:
# Pull a single batch to sanity-check the shapes produced by collate_batch.
for label, text, length in train_loader:
    break
print("Label shape: ", label.shape) # (batch_size, )
print("Text shape: ", text.shape)   # (batch_size, seq len)

Label shape:  torch.Size([64])
Text shape:  torch.Size([64, 26])


## Prepare Model
Basically, in this part we will just define an LSTM neural network and the functions for training.

In [27]:
import torch.nn as nn

class LSTM(nn.Module):
    """Sentence classifier: embedding -> (bi)LSTM -> linear layer.

    Args:
        input_dim: Vocabulary size (number of embedding rows).
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size per direction.
        output_dim: Number of output classes.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout between LSTM layers (only applied when
            ``num_layers > 1``).
        pad_idx: Index of the padding token. When omitted, the module-level
            ``pad_ix`` is used, exactly as before.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers,
                 bidirectional, dropout, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            # Backward-compatible default: fall back to the global padding index.
            pad_idx = pad_ix
        #put padding_idx so asking the embedding layer to ignore padding
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim,
                            hid_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            # Inter-layer dropout has no effect with one layer;
                            # passing a non-zero value there only emits a warning.
                            dropout=dropout if num_layers > 1 else 0.0,
                            batch_first=True)
        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hid_dim * num_directions, output_dim)

    def forward(self, text, text_lengths):
        """Return class scores of shape [batch size, output_dim].

        Args:
            text: Padded token indices, [batch size, seq len].
            text_lengths: Unpadded length of every sequence, [batch size].
        """
        #text = [batch size, seq len]
        embedded = self.embedding(text)
        #embedded = [batch size, seq len, embed dim]

        #++ pack sequence so the LSTM skips the padded positions ++
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False, batch_first=True)

        # Only the final hidden state is needed, so the per-step output is not
        # unpacked.  If no h0 is given, the initial state is all zeroes.
        _, (hn, _) = self.lstm(packed_embedded)
        #hn = [num layers * num directions, batch size, hid dim]

        if self.lstm.bidirectional:
            #concat the final forward (hn[-2,:,:]) and backward (hn[-1,:,:]) states of the top layer
            hn = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        else:
            #single direction: the last entry is the top layer's final state
            hn = hn[-1, :, :]
        #hn = [batch size, hidden dim * num directions]

        return self.fc(hn)

In [28]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    """Initialise a single module in place; meant to be used with ``model.apply``.

    * ``nn.Linear``: Xavier-normal weight, zero bias (skipped if the layer was
      created without a bias).
    * ``nn.LSTM``: orthogonal weight matrices, zero biases.
    * Any other module is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # Linear(..., bias=False) stores None here; zeroing it would raise.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [29]:
input_dim  = len(vocab)
hid_dim    = 256
emb_dim    = 300         # Must match the FastText vector size (fast_embedding has 300 columns).
output_dim = 5 # [0, 1, 2, 3, 4] # We have 5 classes

#for biLSTM
num_layers = 2
bidirectional = True
dropout = 0.5

model = LSTM(input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout).to(device)
model.apply(initialize_weights)
# Use the FastText vectors as the initial embedding weights.  The values are
# copied rather than rebinding `.data`, so training does not modify
# `fast_embedding` in place and a shape mismatch is reported immediately.
with torch.no_grad():
    model.embedding.weight.copy_(fast_embedding)

In [30]:
#we can print the complexity by the number of parameters
def count_parameters(model):
    """Print the element count of every trainable parameter, then the total."""
    sizes = []
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            sizes.append(param.numel())

    for size in sizes:
        print(f'{size:>6}')
    total = sum(sizes)
    print(f'______\n{total:>6}')
    
# Print the per-tensor and total trainable parameter counts of the model.
count_parameters(model)

5140800
307200
262144
  1024
  1024
307200
262144
  1024
  1024
524288
262144
  1024
  1024
524288
262144
  1024
  1024
  2560
     5
______
7863109


In [31]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
# Adam optimizes everything returned by model.parameters().
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

In [32]:
def accuracy(preds, y):
    """Return, as a 0-dim tensor, the fraction of rows in `preds` whose
    highest-scoring column equals the matching entry of `y`."""
    # torch.max along dim 1 returns (values, indices); the indices are the predicted classes.
    _, predicted = torch.max(preds.data, 1)
    n_correct = torch.eq(predicted, y).sum()
    return n_correct / len(y)

In [33]:
def train(model, loader, optimizer, criterion, loader_length):
    """Run one training pass over `loader` and update `model` in place.

    Returns:
        (loss, accuracy): the summed per-batch loss and accuracy, each divided
        by `loader_length`.
    """
    model.train() #useful for batchnorm and dropout
    running_loss, running_acc = 0, 0

    for label, text, text_length in loader:
        # label: (batch_size, ), text: (batch_size, seq len)
        label, text = label.to(device), text.to(device)

        #predict; squeeze(1) only removes dim 1 when it has size 1, otherwise it is a no-op
        predictions = model(text, text_length).squeeze(1)

        #calculate loss
        loss = criterion(predictions, label)
        acc = accuracy(predictions, label)

        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    return running_loss / loader_length, running_acc / loader_length

In [34]:
def evaluate(model, loader, criterion, loader_length):
    """Score `model` on `loader` without computing gradients or updating weights.

    Returns:
        (loss, accuracy): the summed per-batch loss and accuracy, each divided
        by `loader_length`.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for label, text, text_length in loader:
            # label: (batch_size, ), text: (batch_size, seq len)
            label, text = label.to(device), text.to(device)

            # squeeze(1) only removes dim 1 when it has size 1, otherwise it is a no-op
            predictions = model(text, text_length).squeeze(1)

            running_loss += criterion(predictions, label).item()
            running_acc += accuracy(predictions, label).item()

    return running_loss / loader_length, running_acc / loader_length

In [35]:
# Split an elapsed duration into whole minutes and remaining whole seconds.
def epoch_time(start_time, end_time):
    """Return (minutes, seconds) elapsed between two timestamps given in seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

## Training!

In [36]:
# Number of batches in each loader.  len(loader) reports this directly;
# iterating the loaders just to count them would tokenize and collate every
# sample of all three datasets for nothing.
train_loader_length = len(train_loader)
val_loader_length   = len(val_loader)
test_loader_length  = len(test_loader)

In [37]:
import os

best_valid_loss = float('inf')
num_epochs      = 8
patience        = 3  # stop after this many consecutive epochs without improvement
tolerance_counter = 0  # epochs since the validation loss last improved

save_path = f'/root/projects/NLP/Assignment/8_Feb_Sentiment_Analysis/weights/{model.__class__.__name__}.pt'
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):

    start_time = time.time()

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
    valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)

    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        # New best model: checkpoint it and restart the patience count.
        best_valid_loss = valid_loss
        tolerance_counter = 0
        torch.save(model.state_dict(), save_path)
    else:
        tolerance_counter += 1

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # Early stopping: stop once the model has gone `patience` epochs in a row
    # without improving on the validation set (i.e. it starts to overfit).
    if tolerance_counter >= patience:
        break

KeyboardInterrupt: 

In [61]:
def predict(text, text_length):
    """Return the predicted class index for every sequence in `text`.

    Uses the module-level `model`.  The model is switched to evaluation mode
    first (and left in that mode) so that dropout is disabled; without this
    the predictions are random whenever the model was last left in training
    mode, e.g. after an interrupted training run.

    Args:
        text: Token indices; presumably (batch_size, seq len) on the model's
            device, as passed by sentence_checking.
        text_length: Length of every sequence in `text`.

    Returns:
        A tensor with one class index per sequence.
    """
    model.eval()
    with torch.no_grad():
        # squeeze(1) only removes dim 1 when it has size 1, otherwise it is a no-op
        output = model(text, text_length).squeeze(1)
        predicted = torch.max(output.data, 1)[1]
        return predicted

In [66]:
def sentence_checking(test_list):
    """Predict a sentiment class for every raw sentence in `test_list`.

    Each sentence is run through `text_pipeline`, turned into a batch of one
    and passed to `predict`.

    Args:
        test_list: Iterable of raw sentence strings.

    Returns:
        A list with one prediction tensor per sentence, in input order.
    """
    predict_list = []
    for sent in test_list:
        # int64 indices, the same dtype collate_batch produces for training batches.
        token_ids = torch.tensor(text_pipeline(sent), dtype=torch.int64).to(device)
        text = token_ids.reshape(1, -1)  # (1, seq len): a batch holding one sentence
        text_length = torch.tensor([text.size(1)], dtype=torch.int64)
        predict_list.append(predict(text, text_length))
    return predict_list

In [73]:
# Meaning of the predicted class indices 0-4, in order:
#["very negative", "negative", "neutral", "positive", "very positive"]
# The trailing comment on each sentence is the sentiment it is expected to carry.
test_case = ['The movie should have been good', # Negative
    'What is not to like about this product.', # Negative
    "The price is not so bad", # Positive
    'This software is not buggy'] # Positive

print(sentence_checking(test_case))

[tensor([2], device='cuda:0'), tensor([1], device='cuda:0'), tensor([2], device='cuda:0'), tensor([2], device='cuda:0')]
