In [None]:
# thx: https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

In [None]:
#deal with tensors
import torch   
import torch.nn as nn
import torch.optim as optim

#handling text data
from torchtext import data, vocab

import random
import numpy as np
import spacy

from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Run configuration: everything a reader might tune lives in this cell.
SEED = 2020        # single seed shared by every random number generator below
N_EPOCHS = 50      # number of full passes over the training split
ds_path = ''       # path of the input CSV dataset (fill in before running)
output_path = ''   # path of the CSV file that receives per-epoch metrics

# Reproducibility: seed every RNG the notebook relies on, not only torch.
# Python's `random` drives torchtext's split/shuffle, numpy is imported above.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Prerequisite for tokenize='spacy':  python -m spacy download en

# Free-text column: tokenized with spaCy, batches are laid out batch-first and
# each batch also carries the sequence lengths (include_lengths=True).
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)

# Target column: stored as float so it can be fed directly to a float-valued loss.
LABEL = data.LabelField(
    dtype=torch.float,
    batch_first=True,
)

# (attribute name, field) pairs handed to the dataset loader.
fields = [
    ('text', TEXT),
    ('label', LABEL),
]

In [None]:
# Load the custom CSV dataset; its first row is a header and is skipped.
training_data = data.TabularDataset(
    path=ds_path,
    format='csv',
    fields=fields,
    skip_header=True,
)

# Sanity check: show the attributes of the first preprocessed example.
print(vars(training_data.examples[0]))

In [None]:
# 80/20 train/validation split with a reproducible shuffle.
# random.seed() returns None, so it must not be passed as `random_state`;
# seed the generator first and hand split() the resulting generator state.
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.8, random_state=random.getstate())

In [None]:
# Available pretrained vectors: https://torchtext.readthedocs.io/en/latest/vocab.html
#   glove    -> glove.6B.100d
#   fasttext -> fasttext.en.300d
# Load the word vectors used to initialise the embedding layer.
vec = vocab.Vectors('glove.6B.100d')

# Vocabularies are built from the training split only; tokens seen fewer than
# 3 times are left out of the TEXT vocabulary.
TEXT.build_vocab(
    train_data,
    min_freq=3,
    vectors=vec,
)
LABEL.build_vocab(train_data)

# Number of distinct entries in the text vocabulary
print("Size of TEXT vocabulary:", len(TEXT.vocab))

# Number of distinct entries in the label vocabulary
print("Size of LABEL vocabulary:", len(LABEL.vocab))

# Ten most frequent tokens with their counts
print(TEXT.vocab.freqs.most_common(10))

# Uncomment to dump the full token -> index mapping
# print(TEXT.vocab.stoi)

In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    train_device = 'cuda'
else:
    train_device = 'cpu'
print('Will train using: ' + train_device)
device = torch.device(train_device)

# Number of examples per batch
BATCH_SIZE = 64

# Bucketed iterators for both splits: examples are keyed by text length and
# every batch is sorted internally (sort_within_batch=True), which the model's
# packed-sequence step relies on.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

In [None]:
class classifier(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: hidden size of the LSTM (per direction).
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied between stacked LSTM layers.
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()

        #remembered so forward() knows how to assemble the final hidden state
        self.bidirectional = bidirectional
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        #inter-layer dropout only exists when layers are stacked; passing a
        #non-zero value with a single layer has no effect and only triggers a warning
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0,
                           batch_first=True)
        
        #dense layer: its input is one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape [batch size, output_dim].

        Args:
            text: token indices, [batch size, sent_length], sorted by
                decreasing length within the batch (required by packing).
            text_lengths: unpadded length of every sequence in `text`.
        """
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence; recent PyTorch requires the lengths to live on the CPU
        #even when the batch itself is on the GPU
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        #(batch_first does not apply to the returned states)
        
        if self.bidirectional:
            #concat the final forward and backward hidden state of the top layer
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the last entry is the top layer's final state
            hidden = hidden[-1,:,:]
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [None]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
# The embedding matrix is overwritten with TEXT.vocab.vectors further down, so
# its width has to equal the width of the loaded vectors; derive it from them
# instead of hard-coding a number that can drift out of sync.
embedding_dim = TEXT.vocab.vectors.shape[1]
num_hidden_nodes = 32   # LSTM hidden size per direction
num_output_nodes = 1    # single output unit
num_layers = 2          # stacked LSTM layers
bidirection = True      # read the sequence in both directions
dropout = 0.2           # dropout probability passed to the LSTM

In [None]:
#instantiate the model
# Pass the `bidirection` hyperparameter through instead of a hard-coded True,
# so that changing it in the hyperparameter cell actually takes effect.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

In [None]:
# Show the module tree of the instantiated model (its layers and their settings)
print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total number of scalar weights in `model` that require gradients."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total
    
# Report the number of trainable parameters, with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

# Initialize the embedding layer with the pretrained vectors attached to the
# TEXT vocabulary. copy_() writes in place into the existing weight tensor, so
# the vectors must have the same shape as model.embedding.weight.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Shape of the pretrained matrix that was copied in
print(pretrained_embeddings.shape)

In [None]:
# Define optimizer and loss.
# Adam is created with its default settings over every model parameter.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy on probabilities: the model already ends in a Sigmoid.
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that match `y` after rounding to the nearest integer.

    Args:
        preds: tensor of predicted probabilities.
        y: tensor of target labels, comparable element-wise with `preds`.

    Returns:
        A scalar tensor: number of matches divided by the length of the first dimension.
    """
    # 1.0 where the rounded prediction equals the target, 0.0 elsewhere
    hits = torch.eq(torch.round(preds), y).to(torch.float)
    return hits.sum() / len(hits)
    
# Move the model and the loss module to the selected device
# (the GPU when available, otherwise the CPU).
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the batch-averaged metrics.

    Args:
        model: network called as ``model(text, text_lengths)``.
        iterator: iterable of batches exposing ``batch.text`` as a
            ``(tokens, lengths)`` pair and ``batch.label``.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1, precision, recall)``. Every value is the
        mean of the per-batch values (macro-averaged for the sklearn metrics),
        not a metric computed over the whole epoch at once.
    """
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
    epoch_precision = 0
    epoch_recall = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor; squeeze only the output dimension so that a
        #batch holding a single example stays 1D instead of collapsing to a scalar
        predictions = model(text, text_lengths).squeeze(1)  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   

        #round predictions to the closest integer
        predicted = torch.round(predictions).tolist()
        real = batch.label.tolist()

        #compute the f1_score
        f1score = f1_score(real, predicted, average="macro")

        #compute the precision
        precision = precision_score(real, predicted, average="macro")

        #compute the recall
        recall = recall_score(real, predicted, average="macro")
        
        #backpropagate the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #accumulate the per-batch values
        epoch_loss += loss.item()  
        epoch_acc += acc.item()  
        epoch_f1 += f1score
        epoch_precision += precision
        epoch_recall += recall  
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator), epoch_precision / len(iterator), epoch_recall / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch of `iterator` without updating weights.

    Args:
        model: network called as ``model(text, text_lengths)``.
        iterator: iterable of batches exposing ``batch.text`` as a
            ``(tokens, lengths)`` pair and ``batch.label``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(loss, accuracy, f1, precision, recall)``. Every value is the
        mean of the per-batch values (macro-averaged for the sklearn metrics),
        not a metric computed over the whole split at once.
    """
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
    epoch_precision = 0
    epoch_recall = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor; squeeze only the output dimension so that a
            #batch holding a single example stays 1D instead of collapsing to a scalar
            predictions = model(text, text_lengths).squeeze(1)
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #round predictions to the closest integer
            predicted = torch.round(predictions).tolist()
            real = batch.label.tolist()

            #compute the f1_score
            f1score = f1_score(real, predicted, average="macro")

            #compute the precision
            precision = precision_score(real, predicted, average="macro")

            #compute the recall
            recall = recall_score(real, predicted, average="macro")
            
            #accumulate the per-batch values
            epoch_loss += loss.item()  
            epoch_acc += acc.item()  
            epoch_f1 += f1score
            epoch_precision += precision
            epoch_recall += recall  
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator), epoch_precision / len(iterator), epoch_recall / len(iterator)

In [None]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Draw a single-line text progress bar; meant to be called once per loop step.

    The line is written as ``\\r<prefix> |<bar>| <percent>% <suffix>`` followed by
    ``printEnd``. When ``iteration`` equals ``total`` an extra newline is printed
    so that later output starts on a fresh line.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text shown before the bar (Str)
        suffix      - Optional  : text shown after the percentage (Str)
        decimals    - Optional  : number of decimals of the percentage (Int)
        length      - Optional  : width of the bar in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
        printEnd    - Optional  : end character (e.g. "\\r", "\\r\\n") (Str)
    """
    # Percentage completed, formatted with the requested number of decimals
    fraction_done = iteration / float(total)
    percent = format(100 * fraction_done, "." + str(decimals) + "f")

    # Split the bar width into completed and remaining cells
    n_filled = int(length * iteration // total)
    n_empty = length - n_filled
    bar = "".join((fill * n_filled, '-' * n_empty))

    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)

    # Only the final step terminates the line
    if iteration != total:
        return
    print()

In [None]:
# Lowest validation loss seen so far; used to checkpoint only improving epochs
best_valid_loss = float('inf')

# Header of the per-epoch metrics CSV
columns = 'Name,Epoch,Loss,Acc,F1,Precision,Recall'

# The context manager guarantees the metrics file is flushed and closed even
# when an epoch raises or the run is interrupted part-way through.
with open(output_path,'w') as f:
    f.write(columns)
    f.write('\n')

    printProgressBar(0, N_EPOCHS, prefix = 'Progress:', suffix = 'Complete', length = N_EPOCHS)
    for epoch in range(N_EPOCHS):

        #train the model
        train_loss, train_acc, train_f1, train_precision, train_recall = train(model, train_iterator, optimizer, criterion)

        #evaluate the model on the validation split
        valid_loss, valid_acc, valid_f1, valid_precision, valid_recall = evaluate(model, valid_iterator, criterion)

        #save the best model (lowest validation loss so far)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights.pt')

        #one row per split and epoch; all values are written multiplied by 100
        f.write(f'T-SNE_GLOVE_Train,{epoch},{train_loss*100:.2f},{train_acc*100:.2f},{train_f1*100:.2f},{train_precision*100:.2f},{train_recall*100:.2f}')
        f.write('\n')
        f.write(f'T-SNE_GLOVE_Test,{epoch},{valid_loss*100:.2f},{valid_acc*100:.2f},{valid_f1*100:.2f},{valid_precision*100:.2f},{valid_recall*100:.2f}')
        f.write('\n')

        printProgressBar(epoch + 1, N_EPOCHS, prefix = 'Progress:', suffix = 'Complete', length = N_EPOCHS)