<a href="https://colab.research.google.com/github/AmbiTyga/Task-Stylumia/blob/Basic/Model-Training_and_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/AmbiTyga/Task-Stylumia/Basic/TrainTest.7z

--2021-02-28 11:11:33--  https://raw.githubusercontent.com/AmbiTyga/Task-Stylumia/Basic/TrainTest.7z
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9147130 (8.7M) [application/octet-stream]
Saving to: ‘TrainTest.7z’


2021-02-28 11:12:37 (30.4 MB/s) - ‘TrainTest.7z’ saved [9147130/9147130]



In [2]:
!7z x TrainTest.7z


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,4 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 9147130 bytes (8933 KiB)

Extracting archive: TrainTest.7z
--
Path = TrainTest.7z
Type = 7z
Physical Size = 9147130
Headers Size = 194
Method = LZMA2:24
Solid = +
Blocks = 1

  0%     26% 1 - train_parsed.json                           65% 1 - train_parsed.json                          100% 2      Everything is Ok

Files: 2
Size:       109002701
Compressed: 9147130


In [12]:
import pandas as pd
import numpy as np
import torch 
from torchtext import data
from torchtext.vocab import GloVe
import torch.nn as nn
import re

In [4]:
train = pd.read_json('/content/train_parsed.json')
test = pd.read_json('/content/test_parsed.json')

In [5]:
def padding(text,attr = 'title'):
  max_len = 39 if attr=='title' else 282
  sent_len = len(text.split())
  if sent_len>max_len:
    return " ".join(text.split()[:max_len])
  else:
    text = " ".join(text.split()+['<pad>']*(max_len-sent_len))
    return text

In [6]:
train.drop(columns=['tld_with_tok', 'ac_tok','title_tok', 'body_tok', 'text', 'raw_text'],inplace = True)
test.drop(columns=['tld_with_tok', 'ac_tok','title_tok', 'body_tok', 'text', 'raw_text'],inplace = True)

In [7]:
train['text'] = '<tld> '+ train['tld']+ ' <ac> ' + train['alchemy_category'] + ' <title> ' + train['title'].apply(padding)+ ' <body> ' + train['body'].apply(padding,attr='body') 
test['text'] = '<tld> '+ test['tld']+ ' <ac> ' + test['alchemy_category'] + ' <title> ' + test['title'].apply(padding)+ ' <body> ' + test['body'].apply(padding,attr='body') 

In [8]:
from sklearn.model_selection import train_test_split
Train,Val = train_test_split(train,random_state = 2021,test_size = 0.2,stratify = train['label'])

In [20]:
train.select_dtypes('object').columns

Index(['tld', 'alchemy_category', 'title', 'body', 'text'], dtype='object')

In [9]:
Train.to_csv('train.csv',index= False)
Val.to_csv('val.csv',index= False)

In [45]:
TEXT =  data.Field()
fields = [(None,None) if x !='text' else ('text',TEXT) for x in train.columns]
LABEL = data.LabelField()
fields[26] = ('label',LABEL)

In [46]:
train_data, valid_data = data.TabularDataset.splits(
                            path = '/content',
                            train = 'train.csv',
                            validation = 'val.csv',
                            format = 'csv',
                            fields = fields)

In [47]:
TEXT.build_vocab(train_data,vectors = GloVe())
TEXT.build_vocab(valid_data,vectors = GloVe())
LABEL.build_vocab(train_data)

In [48]:
device = torch.device('cuda')

BATCH_SIZE = 32

train_iterator, valid_iterator = data.Iterator.splits(
    (train_data, valid_data),
    sort = False,
    shuffle = False,
    batch_size=BATCH_SIZE,
    device=device)

In [51]:
class NumericalDataset:
  def __init__(self,data,train=True):
    self.features = ['alchemy_category_score', 'alchemy_labels', 'avglinksize',
       'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3',
       'commonlinkratio_4', 'compression_ratio', 'embed_ratio',
       'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio',
       'is_news', 'lengthyLinkDomain', 'linkwordscore',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio']
    self.data = data
    self.flag = train

  def __getitem__(self,i):
    X = self.data.loc[i,self.features].astype(float).values
    y = self.data.loc[i,'label']
    return torch.DoubleTensor(X),torch.LongTensor([y])
  
  def __len__(self):
    return len(self.data)

In [52]:
linear_data = NumericalDataset(train)
linear_data_loader = torch.utils.data.DataLoader(linear_data,batch_size=32)

In [60]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

class AttentionModel(torch.nn.Module):
  def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
    super(AttentionModel, self).__init__()

    """
    Arguments
    ---------
    batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
    output_size : 2 = (pos, neg)
    hidden_sie : Size of the hidden_state of the LSTM
    vocab_size : Size of the vocabulary containing unique words
    embedding_length : Embeddding dimension of GloVe word embeddings
    weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 

    --------

    """

    self.batch_size = batch_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.embedding_length = embedding_length

    self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
    self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)
    self.lstm = nn.LSTM(embedding_length, hidden_size)
    self.label = nn.Linear(hidden_size, output_size)
    #self.attn_fc_layer = nn.Linear()
    
  def attention_net(self, lstm_output, final_state):

    """ 
    Now we will incorporate Attention mechanism in our LSTM model. In this new model, we will use attention to compute soft alignment score corresponding
    between each of the hidden_state and the last hidden_state of the LSTM. We will be using torch.bmm for the batch matrix multiplication.

    Arguments
    ---------

    lstm_output : Final output of the LSTM which contains hidden layer outputs for each sequence.
    final_state : Final time-step hidden state (h_n) of the LSTM

    ---------

    Returns : It performs attention mechanism by first computing weights for each of the sequence present in lstm_output and and then finally computing the
          new hidden state.
          
    Tensor Size :
          hidden.size() = (batch_size, hidden_size)
          attn_weights.size() = (batch_size, num_seq)
          soft_attn_weights.size() = (batch_size, num_seq)
          new_hidden_state.size() = (batch_size, hidden_size)
            
    """

    hidden = final_state.squeeze(0)
    attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
    soft_attn_weights = F.softmax(attn_weights, 1)
    new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)

    return new_hidden_state

  def forward(self, input_sentences, batch_size=None):

    """ 
    Parameters
    ----------
    input_sentence: input_sentence of shape = (batch_size, num_sequences)
    batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)

    Returns
    -------
    Output of the linear layer containing logits for pos & neg class which receives its input as the new_hidden_state which is basically the output of the Attention network.
    final_output.shape = (batch_size, output_size)

    """

    input = self.word_embeddings(input_sentences)
    print(input.shape)
    input = input.permute(1, 0, 2)
    if batch_size is None:
      h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
      c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
    else:
      h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
      c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
      
    output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size) 
    output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size)

    attn_output = self.attention_net(output, final_hidden_state)
    logits = self.label(attn_output)

    return logits

In [61]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
    
def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0]
        target = batch.label
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] is not 32):
                continue
            target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)
	

learning_rate = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300
word_embeddings = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)

model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy

for epoch in range(10):
    train_loss, train_acc = train_model(model, train_iterator, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')
    
test_loss, test_acc = eval_model(model, val_iterator)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')


torch.Size([32, 300])


RuntimeError: ignored