## Boilerplate

In [1]:
# %pip install numpy pandas matplotlib torch sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import torch.nn.functional as F

In [3]:
%matplotlib inline

## Functions to accomplish attention

In [4]:
def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight) 
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0,1)
        if(nonlinearity=='tanh'):
            _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if(s is None):
            s = _s_bias
        else:
            s = torch.cat((s,_s_bias),0)
            
    return s.squeeze()

In [5]:
def batch_matmul(seq, weight, nonlinearity=''):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if(nonlinearity=='tanh'):
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s), 0)
            
    return s.squeeze()

In [6]:
def attention_mul(rnn_outputs, att_weights):
    attn_vectors = None
    for i in range(rnn_outputs.size(0)):
        h_i = rnn_outputs[i]
        a_i = att_weights[i].unsqueeze(1).expand_as(h_i)
        h_i = a_i * h_i
        h_i = h_i.unsqueeze(0)
        if(attn_vectors is None):
            attn_vectors = h_i
        else:
            attn_vectors = torch.cat((attn_vectors,h_i),0)
    return torch.sum(attn_vectors, 0).unsqueeze(0)

## Word attention model with bias

In [7]:
class AttentionWordRNN(nn.Module):
    
    
    def __init__(self, batch_size, num_tokens, embed_size, word_gru_hidden, bidirectional= True):        
        
        super(AttentionWordRNN, self).__init__()
        
        self.batch_size = batch_size
        self.num_tokens = num_tokens
        self.embed_size = embed_size
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        
        self.lookup = nn.Embedding(num_tokens, embed_size)
        if bidirectional == True:
            self.word_gru = nn.GRU(embed_size, word_gru_hidden, bidirectional= True)
            self.weight_W_word = nn.Parameter(torch.Tensor(2* word_gru_hidden,2*word_gru_hidden))
            self.bias_word = nn.Parameter(torch.Tensor(2* word_gru_hidden,1))
            self.weight_proj_word = nn.Parameter(torch.Tensor(2*word_gru_hidden, 1))
        else:
            self.word_gru = nn.GRU(embed_size, word_gru_hidden, bidirectional= False)
            self.weight_W_word = nn.Parameter(torch.Tensor(word_gru_hidden, word_gru_hidden))
            self.bias_word = nn.Parameter(torch.Tensor(word_gru_hidden,1))
            self.weight_proj_word = nn.Parameter(torch.Tensor(word_gru_hidden, 1))
            
        self.softmax_word = nn.Softmax()
        self.weight_W_word.data.uniform_(-0.1, 0.1)
        self.weight_proj_word.data.uniform_(-0.1,0.1)

        
        
    def forward(self, embed, state_word):
        # embeddings
        embedded = self.lookup(embed)
        # word level gru
        output_word, state_word = self.word_gru(embedded, state_word)
        word_squish = batch_matmul_bias(output_word, self.weight_W_word, self.bias_word, nonlinearity='tanh')
        word_squish = word_squish.reshape(
            output_word.shape[0], output_word.shape[1], self.weight_W_word.shape[1])

        word_attn = batch_matmul(word_squish, self.weight_proj_word)
        word_attn = word_attn.reshape(word_squish.shape[0], word_squish.shape[1])
        word_attn_norm = self.softmax_word(word_attn.transpose(1, 0))
        word_attn_vectors = attention_mul(output_word, word_attn_norm.transpose(1, 0))
        return word_attn_vectors, state_word, word_attn_norm
    
    def init_hidden(self):
        if self.bidirectional == True:
            return Variable(torch.zeros(2, self.batch_size, self.word_gru_hidden))
        else:
            return Variable(torch.zeros(1, self.batch_size, self.word_gru_hidden))        

## Sentence Attention model with bias

In [8]:
class AttentionSentRNN(nn.Module):
    
    
    def __init__(self, batch_size, sent_gru_hidden, word_gru_hidden, n_classes, bidirectional= True):        
        
        super(AttentionSentRNN, self).__init__()
        
        self.batch_size = batch_size
        self.sent_gru_hidden = sent_gru_hidden
        self.n_classes = n_classes
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        
        
        if bidirectional == True:
            self.sent_gru = nn.GRU(2 * word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden ,2* sent_gru_hidden))
            self.bias_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden, 1))
            self.final_linear = nn.Linear(2* sent_gru_hidden, n_classes)
        else:
            self.sent_gru = nn.GRU(word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(sent_gru_hidden ,sent_gru_hidden))
            self.bias_sent = nn.Parameter(torch.Tensor(sent_gru_hidden,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(sent_gru_hidden, 1))
            self.final_linear = nn.Linear(sent_gru_hidden, n_classes)
        self.softmax_sent = nn.Softmax()
        self.final_softmax = nn.Softmax()
        self.weight_W_sent.data.uniform_(-0.1, 0.1)
        self.weight_proj_sent.data.uniform_(-0.1,0.1)

        
        
    def forward(self, word_attention_vectors, state_sent):
        output_sent, state_sent = self.sent_gru(word_attention_vectors, state_sent)
        sent_squish = batch_matmul_bias(output_sent, self.weight_W_sent, self.bias_sent, nonlinearity='tanh')
        sent_squish = sent_squish.reshape(
            output_sent.shape[0], output_sent.shape[1], self.weight_W_sent.shape[1])
        
        sent_attn = batch_matmul(sent_squish, self.weight_proj_sent)
        sent_attn = sent_attn.reshape(sent_squish.shape[0], sent_squish.shape[1])
        sent_attn_norm = self.softmax_sent(sent_attn.transpose(1, 0))
        sent_attn_vectors = attention_mul(output_sent, sent_attn_norm.transpose(1, 0))
        final_map = self.final_linear(sent_attn_vectors.squeeze(0))
        return F.log_softmax(final_map), state_sent, sent_attn_norm
    
    def init_hidden(self):
        if self.bidirectional == True:
            return Variable(torch.zeros(2, self.batch_size, self.sent_gru_hidden))
        else:
            return Variable(torch.zeros(1, self.batch_size, self.sent_gru_hidden))   

## Functions to train the model

In [9]:
df = pd.read_csv('Pre-processed-comments-1650746640.csv')

from torchtext.vocab import build_vocab_from_iterator
from hazm import word_tokenize, sent_tokenize

vocab = build_vocab_from_iterator(df.text.apply(word_tokenize), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [10]:
word_attn = AttentionWordRNN(batch_size=64, num_tokens=len(vocab), embed_size=300, 
                             word_gru_hidden=100, bidirectional= True)

In [11]:
sent_attn = AttentionSentRNN(batch_size=64, sent_gru_hidden=100, word_gru_hidden=100, 
                             n_classes=2, bidirectional= True)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
def train_data(mini_batch, targets, word_attn_model, sent_attn_model, word_optimizer, sent_optimizer, criterion):
    state_word = word_attn_model.init_hidden().to(device)
    state_sent = sent_attn_model.init_hidden().to(device)
    max_sents, batch_size, max_tokens = mini_batch.size()
    word_optimizer.zero_grad()
    sent_optimizer.zero_grad()
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(mini_batch[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent, _ = sent_attn_model(s, state_sent)
    loss = criterion(y_pred.to(device), targets) 
    loss.backward()
    
    word_optimizer.step()
    sent_optimizer.step()
    
    return loss.item()

In [14]:
def get_predictions(val_tokens, word_attn_model, sent_attn_model):
    max_sents, batch_size, max_tokens = val_tokens.size()
    state_word = word_attn_model.init_hidden().to(device)
    state_sent = sent_attn_model.init_hidden().to(device)
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(val_tokens[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent, _ = sent_attn_model(s, state_sent)    
    return y_pred

In [15]:
learning_rate = 1e-1
momentum = 0.9

word_optmizer = torch.optim.SGD(word_attn.parameters(), lr=learning_rate, momentum= momentum)
sent_optimizer = torch.optim.SGD(sent_attn.parameters(), lr=learning_rate, momentum= momentum)

criterion = nn.CrossEntropyLoss()

In [16]:
word_attn.to(device), sent_attn.to(device)

(AttentionWordRNN(
   (lookup): Embedding(26029, 300)
   (word_gru): GRU(300, 100, bidirectional=True)
   (softmax_word): Softmax(dim=None)
 ),
 AttentionSentRNN(
   (sent_gru): GRU(200, 100, bidirectional=True)
   (final_linear): Linear(in_features=200, out_features=2, bias=True)
   (softmax_sent): Softmax(dim=None)
   (final_softmax): Softmax(dim=None)
 ))

## Loading the data

In [17]:
df = pd.read_csv('Pre-processed-comments-1650746640.csv')

from hazm import word_tokenize, sent_tokenize

def tokenize(text):
    try: return [vocab(word_tokenize(s)) for s in sent_tokenize(text)]
    except: return

df['tokens'] = df.text.apply(lambda x: tokenize(x))

df = df[~df.tokens.isna()]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.tokens.values, df.feeling.values, test_size = 0.3, random_state= 42)

y_train.shape

(87530,)

In [18]:
def pad_batch(mini_batch):
    
    mini_batch_size = len(mini_batch)
    max_sent_len = int(np.max([len(x) for x in mini_batch]))
    max_token_len = int(np.max([len(val) for sublist in mini_batch for val in sublist]))
    main_matrix = np.zeros((mini_batch_size, max_sent_len, max_token_len), dtype= np.int)
    
    for i in range(main_matrix.shape[0]):
        for j in range(main_matrix.shape[1]):
            for k in range(main_matrix.shape[2]):
                try:
                    main_matrix[i,j,k] = mini_batch[i][j][k]
                except IndexError:
                    pass
                
    return Variable(torch.from_numpy(main_matrix).transpose(0,1))

In [19]:
def test_accuracy_mini_batch(tokens, labels, word_attn, sent_attn):
    y_pred = get_predictions(tokens, word_attn, sent_attn)
    _, y_pred = torch.max(y_pred, 1)
    correct = np.ndarray.flatten(y_pred.data.cpu().numpy())
    labels = np.ndarray.flatten(labels.data.cpu().numpy())
    num_correct = sum(correct == labels)
    return float(num_correct) / len(correct)

In [20]:
def test_accuracy_full_batch(tokens, labels, mini_batch_size, word_attn, sent_attn):
    p = []
    l = []
    g = gen_minibatch(tokens, labels, mini_batch_size)
    for token, label in g:
        y_pred = get_predictions(token.to(device), word_attn, sent_attn)
        _, y_pred = torch.max(y_pred, 1)
        p.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
        l.append(np.ndarray.flatten(label.data.cpu().numpy()))
    p = [item for sublist in p for item in sublist]
    l = [item for sublist in l for item in sublist]
    p = np.array(p)
    l = np.array(l)
    num_correct = sum(p == l)
    return float(num_correct)/ len(p)

In [21]:
def test_data(mini_batch, targets, word_attn_model, sent_attn_model):    
    state_word = word_attn_model.init_hidden().to(device)
    state_sent = sent_attn_model.init_hidden().to(device)
    max_sents, batch_size, max_tokens = mini_batch.size()
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(mini_batch[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent,_ = sent_attn_model(s, state_sent)
    loss = criterion(y_pred.to(device), targets)     
    return loss.item()

In [22]:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert inputs.shape[0] == targets.shape[0]
    if shuffle:
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
    for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

In [23]:
def gen_minibatch(tokens, labels, mini_batch_size, shuffle= True):
    for token, label in iterate_minibatches(tokens, labels, mini_batch_size, shuffle= shuffle):
        token = pad_batch(token)
        yield token.to(device), Variable(torch.from_numpy(label), requires_grad= False).to(device)

In [24]:
def check_val_loss(val_tokens, val_labels, mini_batch_size, word_attn_model, sent_attn_model):
    val_loss = []
    for token, label in iterate_minibatches(val_tokens, val_labels, mini_batch_size, shuffle= True):
        val_loss.append(
            test_data(
                pad_batch(token).to(device), 
                Variable(torch.from_numpy(label), requires_grad= False).to(device), 
                word_attn_model, 
                sent_attn_model)
        )
    return np.mean(val_loss)

In [25]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

## Training

In [26]:
def train_early_stopping(mini_batch_size, X_train, y_train, X_test, y_test, word_attn_model, sent_attn_model, 
                         word_attn_optimiser, sent_attn_optimiser, loss_criterion, iterations, 
                         print_val_loss_every = 1000, print_loss_every = 50):
    start = time.time()
    
    loss_full = []
    loss_epoch = []
    loss_smooth = []
    accuracy_full = []
    accuracy_epoch = []
    epoch_counter = 0
    
    g = gen_minibatch(X_train, y_train, mini_batch_size)
    
    for i in range(1, iterations + 1):
        try:
            tokens, labels = next(g)
            loss = train_data(
                tokens, labels, word_attn_model, sent_attn_model, 
                word_attn_optimiser, sent_attn_optimiser, loss_criterion)
            
            loss_full.append(loss)
            loss_epoch.append(loss)
            
            acc = test_accuracy_mini_batch(tokens, labels, word_attn_model, sent_attn_model)
            
            accuracy_full.append(acc)
            accuracy_epoch.append(acc)
            
            # print loss every n passes
            if i % print_loss_every == 0:
                print('Loss at %d minibatches, %d epoch,(%s) is %f' %(i, epoch_counter, timeSince(start), np.mean(loss_epoch)))
                print('Accuracy at %d minibatches is %f' % (i, np.mean(accuracy_epoch)))
                
            # check validation loss every n passes
            if i % print_val_loss_every == 0:
                val_loss = check_val_loss(X_test, y_test, mini_batch_size, word_attn_model, sent_attn_model)
                print('Average training loss at this epoch..minibatch..%d..is %f' % (i, np.mean(loss_epoch)))
                print('Validation loss after %d passes is %f' %(i, val_loss))
                
                if val_loss > np.mean(loss_full):
                    print('Validation loss is higher than training loss at %d is %f , stopping training!' % (i, val_loss))
                    print('Average training loss at %d is %f' % (i, np.mean(loss_full)))
                    
        except StopIteration:
            epoch_counter += 1
            print('Reached %d epocs' % epoch_counter)
            print('i %d' % i)
            
            g = gen_minibatch(X_train, y_train, mini_batch_size)
            loss_epoch, accuracy_epoch = [], []

    return loss_full

In [27]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# from google.colab import files

loss_full = train_early_stopping(
    64, X_train, y_train, X_test, y_test, 
    word_attn, sent_attn, word_optmizer, sent_optimizer, criterion, 2700, 1000, 100)

import time
import torch

timestamp = int(time.time())

torch.save(word_attn.state_dict(), f'./word_attn-{timestamp}')
torch.save(sent_attn.state_dict(), f'./sent_attn-{timestamp}')

# files.download(f'./word_attn-{timestamp}')
# files.download(f'./sent_attn-{timestamp}')

Loss at 100 minibatches, 0 epoch,(1m 53s) is 0.878781
Accuracy at 100 minibatches is 0.817813
Loss at 200 minibatches, 0 epoch,(4m 24s) is 0.779223
Accuracy at 200 minibatches is 0.814609
Loss at 300 minibatches, 0 epoch,(6m 32s) is 0.753488
Accuracy at 300 minibatches is 0.816042
Loss at 400 minibatches, 0 epoch,(8m 58s) is 0.706916
Accuracy at 400 minibatches is 0.822695
Loss at 500 minibatches, 0 epoch,(11m 6s) is 0.654303
Accuracy at 500 minibatches is 0.830187
Loss at 600 minibatches, 0 epoch,(13m 29s) is 0.634232
Accuracy at 600 minibatches is 0.832031
Loss at 700 minibatches, 0 epoch,(15m 54s) is 0.636330
Accuracy at 700 minibatches is 0.828884
Loss at 800 minibatches, 0 epoch,(18m 20s) is 0.628280
Accuracy at 800 minibatches is 0.830020
Loss at 900 minibatches, 0 epoch,(20m 46s) is 0.626704
Accuracy at 900 minibatches is 0.828941


In [None]:
test_accuracy_full_batch(X_test, y_test, 64, word_attn, sent_attn)

In [None]:
test_accuracy_full_batch(X_train, y_train, 64, word_attn, sent_attn)

In [None]:
def predict(text, word_attn_model, sent_attn_model):
    
    tokens = [tokenize(text)]
    val_tokens = pad_batch(tokens)
    max_sents, batch_size, max_tokens = val_tokens.size()
    
    state_word = Variable(torch.zeros(2, 1, 100)).to(device)
    state_sent = Variable(torch.zeros(2, 1, 100)).to(device)
    
    s = None
    for i in range(max_sents):
        
        _s, state_word, _ = word_attn_model(
            val_tokens[i,:,:].transpose(0,1), state_word)
        
        if(s is None): s = _s
        else: s = torch.cat((s,_s),0) 
            
    y_pred, state_sent, _ = sent_attn_model(s, state_sent)  
    _, y_pred = torch.max(y_pred, 1)
    
    return {0.:'SAD', 1.:'HAPPY'}[y_pred.item()]

In [None]:
predict('واقعا غذای بدی بود', word_attn, sent_attn) 