In [1]:
from time import asctime
import pandas as pd
import numpy as np
import logging
import string
import nltk
from  collections import Counter
from sklearn.model_selection import train_test_split
import random

import torch 
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader , TensorDataset

# preprocessingfile = 'preprocessing.log'
datafile = 'Train.txt'
# logging.basicConfig(filename = preprocessingfile , format='%(asctime)s %(message)s',filemode='w')
# logger = logging.getLogger()
input , output = [], []
# logger.setLevel(logging.DEBUG)
with open(datafile) as file:
    for line in file.readlines():
        text = line.split(',')
        input.append(text[0][1:-1])
        output.append(text[1].strip()[1:-1])

In [2]:
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
test_input , test_output = [], []
test_datafile = '/data1/home/armangupta/Semester 3/DLNLP/Assignment_4_A/Assignment4aTestDataset.txt'

with open(test_datafile) as file:
    for line in file.readlines():
        text = line.split(',')
        test_input.append(text[0][1:-1])
        test_output.append(text[1].strip()[1:-1])

In [4]:
special = set()
for inp in input:
    for i in inp:
        if not i.isalnum():
            special.add(i)
special

{' ', '/'}

In [5]:
test_input

['august 14 1733',
 'monday 24 november 1625',
 'january 24 1723',
 '1551 18 nov',
 'aug 10 1591',
 '3 november 2064',
 '1554 21 jun',
 'december 29 1661',
 'monday december 15 1919',
 'may 8 1727',
 'friday 8 february 2064',
 '8/27/05',
 '28/05/1572',
 'wed 7 nov 1860',
 'october 19 1761',
 'march 30 1816',
 '28 august 1795',
 '12/21/93',
 'march 29 1980',
 'august 17 1706',
 'december 11 1775',
 'february 20 1899',
 '9/12/35',
 'aug 24 1893',
 '31 july 1897',
 'sunday april 28 1624',
 'dec 30 1562',
 'may 2 1580',
 'jan 1 1665',
 '22 september 1861',
 'january 1 1872',
 '1992 12 april',
 'october 20 1885',
 'may 20 1649',
 '11 aug 1600',
 'thursday september 22 1644',
 '21 may 1572',
 'sun 25 jun 1820',
 'tue 18 may 2055',
 'nov 18 2057',
 'sunday november 17 1946',
 '25 may 1658',
 '1740 3 sep',
 'nov 22 1986',
 'mar 24 1950',
 '1864 29 mar',
 'saturday january 26 1658',
 '7 march 1939',
 'may 27 1793',
 'tuesday december 9 1845',
 '5 september 1806',
 'apr 13 1648',
 'april 8 1702'

In [6]:
class Preprocessing:
    def __init__(self):
        pass
    def lowerCase(self ,text):
        return text.lower()
    def splitText(self ,text):
        return text.replace('/',' ').replace('-', ' ').split()
    def correctNumber(self,text):
        ''' text is the array of strings.'''
        num = [str(i) for i in range(1,10)]

        text = ['0'+t if t in num else t for t in text]
        return text

    def preprocess(self,text):
        text= self.lowerCase(text) 
        text= self.splitText(text) 
        text= self.correctNumber(text) 
        return text

In [7]:
pre = Preprocessing()
# input = [pre.preprocess(inp) for inp in input]
# output = [pre.preprocess(out) for out in output]
inp_words ,out_words = [] ,[]
for inp in input:
    inp_words.extend(inp)

for out in output:
    for o in out:
        out_words.extend(o)

In [8]:
counter = Counter(inp_words)
inp_word2int = dict(zip(counter.keys() , np.arange(1, 1+len(counter))))
inp_word2int['<PAD>'] = 0
inp_word2int

{'m': 1,
 'a': 2,
 'y': 3,
 ' ': 4,
 '2': 5,
 '0': 6,
 '3': 7,
 '4': 8,
 '9': 9,
 '1': 10,
 '6': 11,
 '5': 12,
 '/': 13,
 'r': 14,
 '7': 15,
 'j': 16,
 'u': 17,
 'n': 18,
 'f': 19,
 'i': 20,
 'd': 21,
 'w': 22,
 'e': 23,
 's': 24,
 'p': 25,
 't': 26,
 'b': 27,
 '8': 28,
 'o': 29,
 'c': 30,
 'l': 31,
 'v': 32,
 'g': 33,
 'h': 34,
 '<PAD>': 0}

In [9]:
unique_out_words = np.unique(out_words)
out_word2int = dict(zip(unique_out_words ,np.arange(3,3+len(unique_out_words))))
out_word2int['<E>'] = 2
out_word2int['<S>'] = 1
out_word2int['<PAD>'] = 0
out_word2int

{'-': 3,
 '0': 4,
 '1': 5,
 '2': 6,
 '3': 7,
 '4': 8,
 '5': 9,
 '6': 10,
 '7': 11,
 '8': 12,
 '9': 13,
 '<E>': 2,
 '<S>': 1,
 '<PAD>': 0}

In [10]:
## input and output encoder
input = [[inp_word2int[w] for w in inp] for inp in input]
output =[[out_word2int[w] for w in out] for out in output]
test_input = [[inp_word2int[w] for w in inp] for inp in test_input]
test_output =[[out_word2int[w] for w in out] for out in test_output]


In [11]:
test_input[2] 

[16, 2, 18, 17, 2, 14, 3, 4, 5, 8, 4, 10, 15, 5, 7]

In [12]:
np.array(output)

array([[ 6,  4,  7, ...,  3,  6,  4],
       [ 5, 10,  7, ...,  3,  4, 13],
       [ 6,  4,  5, ...,  3,  5,  9],
       ...,
       [ 5, 12,  9, ...,  3,  5, 12],
       [ 6,  4, 11, ...,  3,  5,  5],
       [ 5, 11,  5, ...,  3,  6, 11]])

In [13]:
output_for_labels = np.array([np.append(out,[out_word2int['<E>']]) for out in output])
test_output_for_labels = np.array([np.append(out,[out_word2int['<E>']]) for out in test_output])

In [14]:
output = [np.append([out_word2int['<S>']] ,out) for out in output]
test_output = [np.append([out_word2int['<S>']] ,out) for out in test_output]

In [15]:
test_output = np.array(test_output)

In [16]:
input[:5] , output[:5] , output_for_labels[:5] , test_output_for_labels[:5]

([[1, 2, 3, 4, 5, 6, 4, 5, 6, 7, 8],
  [9, 4, 1, 2, 3, 4, 10, 11, 7, 6],
  [10, 12, 13, 6, 7, 13, 5, 6, 10, 8],
  [1, 2, 14, 4, 10, 11, 4, 10, 11, 15, 12],
  [16, 17, 18, 4, 10, 11, 4, 10, 11, 8, 6]],
 [array([1, 6, 4, 7, 8, 3, 4, 9, 3, 6, 4]),
  array([ 1,  5, 10,  7,  4,  3,  4,  9,  3,  4, 13]),
  array([1, 6, 4, 5, 8, 3, 4, 7, 3, 5, 9]),
  array([ 1,  5, 10, 11,  9,  3,  4,  7,  3,  5, 10]),
  array([ 1,  5, 10,  8,  4,  3,  4, 10,  3,  5, 10])],
 array([[ 6,  4,  7,  8,  3,  4,  9,  3,  6,  4,  2],
        [ 5, 10,  7,  4,  3,  4,  9,  3,  4, 13,  2],
        [ 6,  4,  5,  8,  3,  4,  7,  3,  5,  9,  2],
        [ 5, 10, 11,  9,  3,  4,  7,  3,  5, 10,  2],
        [ 5, 10,  8,  4,  3,  4, 10,  3,  5, 10,  2]]),
 array([[ 5, 11,  7,  7,  3,  4, 12,  3,  5,  8,  2],
        [ 5, 10,  6,  9,  3,  5,  5,  3,  6,  8,  2],
        [ 5, 11,  6,  7,  3,  4,  5,  3,  6,  8,  2],
        [ 5,  9,  9,  5,  3,  5,  5,  3,  5, 12,  2],
        [ 5,  9, 13,  5,  3,  4, 12,  3,  5,  4,  2]]))

In [17]:
len_input , len_output = np.array([len(inp) for inp in input]) , np.array([len(out)for out in output])
test_len_input = np.array([len(inp) for inp in test_input]) 

In [18]:
def padding(inputs , seq_len , pad_id):
    pad_inputs = np.full(shape = (len(inputs),seq_len) , fill_value= pad_id)
    for i,x in enumerate(inputs):
        pad_inputs[i ,:len(x)] = x 
    return pad_inputs

input = padding(input, max(len_input), inp_word2int['<PAD>'])
output = padding(output,max(len_output), out_word2int['<PAD>'])
test_input = padding(test_input,max(test_len_input),inp_word2int['<PAD>'])
print(f'Input Shape : {input.shape} , Output Shape : {output.shape} , Test Inp : {test_input.shape}')

Input Shape : (40000, 27) , Output Shape : (40000, 11) , Test Inp : (10000, 27)


In [20]:
train_input , val_input , train_output, val_output , train_input_len , val_input_len , train_output_labels , val_output_labels = train_test_split(input , output ,len_input , output_for_labels, test_size = 0.1)
print(f'Train Input Shape : {train_input.shape} , Train Output Shape : {train_output.shape} , Train Input Len Shape : {train_input_len.shape} , Train Output Len Shape :{train_output_labels.shape}')
print(f'Val Input Shape : {val_input.shape} , Val Output Shape : {val_output.shape},Val Input Len Shape : {val_input_len.shape} , Val Output Len Shape :{val_output_labels.shape}')

Train Input Shape : (36000, 27) , Train Output Shape : (36000, 11) , Train Input Len Shape : (36000,) , Train Output Len Shape :(36000, 11)
Val Input Shape : (4000, 27) , Val Output Shape : (4000, 11),Val Input Len Shape : (4000,) , Val Output Len Shape :(4000, 11)


In [21]:
train = TensorDataset(torch.from_numpy(train_input) , torch.from_numpy(train_output) ,torch.from_numpy(train_input_len) , torch.from_numpy(train_output_labels))
validation = TensorDataset(torch.from_numpy(val_input) , torch.from_numpy(val_output) , torch.from_numpy(val_input_len) , torch.from_numpy(val_output_labels))
test = TensorDataset(torch.from_numpy(test_input) , torch.from_numpy(test_output) , torch.from_numpy(test_len_input) , torch.from_numpy(test_output_for_labels))

BATCH_SIZE = 16
trainloader = DataLoader(train , shuffle = True, batch_size = BATCH_SIZE)
valloader = DataLoader(validation, shuffle= False, batch_size = BATCH_SIZE)
testloader = DataLoader(test, shuffle= False, batch_size = BATCH_SIZE)

In [22]:
class Encoder(nn.Module):
    def __init__(self , vocab_size , emb_size, hidden_size , num_layers ,pad_id):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(self.vocab_size , self.emb_size , padding_idx= pad_id )
        self.GRU = nn.GRU(input_size = self.emb_size , hidden_size = self.hidden_size , num_layers = self.num_layers , bidirectional = True , batch_first = True)

    def forward(self, x):
        # h0 = self.initialize_weights(x.shape[0])
        embs = self.embedding(x)
        output , hn = self.GRU(embs)
        return output , hn
    
    def initialize_weights(self, batch_size):
        h0 = torch.rand(size = (self.num_layers * 2 ,batch_size, self.hidden_size))
        return h0
        

In [23]:
encoder = Encoder(len(inp_word2int) , emb_size= 30 , hidden_size= 30, num_layers=1 , pad_id= inp_word2int['<PAD>'] )
print(encoder)

Encoder(
  (embedding): Embedding(35, 30, padding_idx=0)
  (GRU): GRU(30, 30, batch_first=True, bidirectional=True)
)


In [24]:
def masked_softmax(X ,valid_len):
    if valid_len == None:
      return nn.Softmax(dim = -1)(X)
    maxlen = X.shape[-1]
    if valid_len.dim() == 1:
        valid_len = valid_len.unsqueeze(1)
    # print(valid_len.shape , valid_len.dim())
    mask = torch.arange(maxlen)[None, :] < valid_len[:,:,None]
    # print(mask.shape, X.shape )
    X[~mask] = float('-inf')
    return nn.Softmax(dim = -1)(X)



class MLPAttention(nn.Module):
    def __init__(self, units, k_dim ,q_dim ,dropout):
        super(MLPAttention , self).__init__()
        self.W_k = nn.Linear(k_dim , units ,bias= False)
        self.W_q = nn.Linear(q_dim , units ,bias= False)
        self.v = nn.Linear(units , 1,bias = False)
        self.dropout = nn.Dropout(dropout)

    def forward(self,query ,key ,value,valid_len):
        query, key = self.W_q(query) , self.W_k(key)
        # print('Query ',query , key , value)
        features = query.unsqueeze(dim=2) + key.unsqueeze(dim=1)
        features = torch.tanh(features)
        scores = self.v(features).squeeze(dim=-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_len))

        return torch.bmm(attention_weights , value)
        


In [25]:
attn = MLPAttention(8,2,2,0.1)
value = np.arange(40).reshape(1,10,4).repeat(2 ,axis =0)
attn(torch.ones((2,1,2)) , torch.ones((2,10,2)) , torch.from_numpy(value).float(),torch.from_numpy(np.array([2,6]))) 

tensor([[[2.2222, 3.3333, 4.4444, 5.5556]],

        [[6.6667, 7.4074, 8.1481, 8.8889]]], grad_fn=<BmmBackward0>)

In [27]:
class Decoder(nn.Module):
    def __init__(self, vocab_size , emb_size, hidden_size , num_layers ,pad_id, enc_o_last_dim ,dropout):
        super(Decoder,self).__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(self.vocab_size , self.emb_size , padding_idx= pad_id)
        self.GRU = nn.GRU(input_size = self.emb_size + 2 * self.hidden_size , hidden_size = self.hidden_size, \
            num_layers = self.num_layers , batch_first = True)
        self.attn_layer = MLPAttention(units = hidden_size, k_dim = enc_o_last_dim, q_dim = self.hidden_size ,dropout = dropout)
        self.linear_softmax = nn.Sequential(
            nn.Linear(self.hidden_size , self.vocab_size),
            nn.Softmax(dim= -1)
        )
    def forward(self, X, state, train = True):
        enc_outputs , hidden_state, enc_valid_len = state
        # print("OUT:", state)
        X = self.embedding(X)
        outputs = []
        X = torch.swapaxes(X,0,1)
        teacher_forcing_ratio = 0.5
        for i in range(len(X)):
            if i == 0:
                x = X[i].unsqueeze(dim = 1)
            else:
                if train:
                    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
                    if use_teacher_forcing:
                        x = X[i].unsqueeze(dim = 1)
                    else:
                        x = out.detach()
                else:
                    x = out.detach()
                    
            query = hidden_state[0].unsqueeze(dim = 1)
            context = self.attn_layer(query= query , key = enc_outputs, value = enc_outputs ,valid_len = enc_valid_len )
            x = torch.concat((context , x),dim=-1)
            out,hidden_state = self.GRU(x,hidden_state[0].unsqueeze(dim = 0))
            # print(out.shape)
            outputs.append(out)
        
        # print('outputs:' ,outputs)
        outputs = self.linear_softmax(torch.concat(outputs , dim=1))
        return outputs , hidden_state

In [28]:
encoder = Encoder(len(inp_word2int) , emb_size= 300 , hidden_size= 300, num_layers=1 , pad_id= inp_word2int['<PAD>'] )
print(encoder)
decoder = Decoder(len(out_word2int) , 300 , 300, 1, out_word2int['<PAD>'], 600, 0.1)

Encoder(
  (embedding): Embedding(35, 300, padding_idx=0)
  (GRU): GRU(300, 300, batch_first=True, bidirectional=True)
)


In [29]:
from itertools import chain
from tqdm import tqdm
lr = 0.001
# enc_optimizer = torch.optim.AdamW(encoder.parameters() ,lr = 0.001)
# dec_optimizer = torch.optim.AdamW(decoder.parameters() ,lr = 0.001)
optimizer = torch.optim.AdamW(chain(encoder.parameters(),decoder.parameters()) ,lr = lr)
criteria = nn.CrossEntropyLoss()

In [30]:
def exactmatch(decoder_output ,actual_output):
    sum = torch.sum(decoder_output == actual_output , axis = 1)
    # print(sum)
    return torch.sum(sum == 11).item() 

def train(encoder , decoder, trainloader , valloader ,testloader, optimizer , criteria ,epochs ,device):

    for epoch in tqdm(range(epochs)):
        m_loss =[]
        m_acc = []
        encoder.train()
        decoder.train()
        total = 0
        correct = 0
        total_example = 0
        exact_match = 0
        for I, O, I_len , O_labels in tqdm(trainloader):
            I, O, I_len , O_labels = I.to(device), O.to(device), I_len , O_labels.to(device)
            loss = 0
            optimizer.zero_grad()
            # dec_optimizer.zero_grad()
            enc_o , enc_h = encoder(I)
            state = enc_o, enc_h , I_len
            decoder_o , decoder_h = decoder(O , state)
            onehotlabels = F.one_hot(O_labels , len(out_word2int) ).float()
            # print(decoder_o ,onehotlabels)
            loss = criteria(decoder_o,onehotlabels)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(encoder.parameters(), 5)
            # torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()
            # dec_optimizer.step()
            m_loss.append(loss.item())
            accuracy = torch.mean(torch.sum(torch.argmax(decoder_o ,dim = -1) ==  O_labels ,dim = 1).float())
            correct += torch.sum(torch.argmax(decoder_o ,dim = -1) ==  O_labels)
            total += torch.sum(torch.argmax(onehotlabels ,dim = -1) ==  O_labels)
            exact_match += exactmatch(torch.argmax(decoder_o ,dim = -1) , O_labels)
            total_example+=len(O_labels)
            

        encoder.eval()
        decoder.eval()
        v_loss = []
        val_correct , val_total = 0,0
        val_total_example = 0
        val_exact_match = 0
        with torch.no_grad():
            for I, O, I_len , O_labels in tqdm(valloader):
                I, O, I_len , O_labels = I.to(device), O.to(device), I_len , O_labels.to(device)
                loss = 0
                enc_o , enc_h = encoder(I)
                state = enc_o, enc_h , I_len
                decoder_o , decoder_h = decoder(O , state ,False)
                onehotlabels = F.one_hot(O_labels , len(out_word2int) ).float()
                # print(decoder_o ,onehotlabels)
                loss = criteria(decoder_o,onehotlabels)
                v_loss.append(loss.item())
                val_correct += torch.sum(torch.argmax(decoder_o ,dim = -1) ==  O_labels)
                val_total += torch.sum(torch.argmax(onehotlabels ,dim = -1) ==  O_labels)
                val_exact_match += exactmatch(torch.argmax(decoder_o ,dim = -1) , O_labels)
                val_total_example+=len(O_labels)
        
        test_loss = []
        test_correct , test_total = 0,0
        test_total_example = 0
        test_exact_match = 0
        with torch.no_grad():
            for I, O, I_len , O_labels in tqdm(testloader):
                I, O, I_len , O_labels = I.to(device), O.to(device), I_len , O_labels.to(device)
                loss = 0
                enc_o , enc_h = encoder(I)
                state = enc_o, enc_h , I_len
                decoder_o , decoder_h = decoder(O , state ,False)
                onehotlabels = F.one_hot(O_labels , len(out_word2int) ).float()
                # print(decoder_o ,onehotlabels)
                loss = criteria(decoder_o, onehotlabels)
                test_loss.append(loss.item())
                test_correct += torch.sum(torch.argmax(decoder_o ,dim = -1) ==  O_labels)
                test_total += torch.sum(torch.argmax(onehotlabels ,dim = -1) ==  O_labels)
                test_exact_match += exactmatch(torch.argmax(decoder_o ,dim = -1) , O_labels)
                test_total_example+=len(O_labels)
        print(f'epoch {epoch} || train loss : {np.mean(m_loss)} ||train acc : {correct/total} || ExactMatch train Acc :{exact_match/total_example} || val loss : {np.mean(v_loss)} || val acc : {val_correct/val_total} ||  ExactMatch val Acc :{val_exact_match/val_total_example} ||test loss : {np.mean(test_loss)} || test acc : {test_correct/test_total} ||  ExactMatch test Acc :{test_exact_match/test_total_example}     ')


In [31]:
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
encoder = encoder.to(device)
decoder = decoder.to(device)

train(encoder, decoder , trainloader , valloader,testloader, optimizer ,criteria , 100 , device)

100%|██████████| 2250/2250 [01:38<00:00, 22.74it/s]
100%|██████████| 250/250 [00:04<00:00, 58.39it/s]
100%|██████████| 625/625 [00:10<00:00, 61.25it/s]
  1%|          | 1/100 [01:53<3:07:12, 113.46s/it]

epoch 0 || train loss : 1.3511079547670153 ||train acc : 0.9073636531829834 || ExactMatch train Acc :0.5185277777777778 || val loss : 1.303944194316864 || val acc : 0.9772727489471436 ||  ExactMatch val Acc :0.76975 ||test loss : 1.302432506942749 || test acc : 0.9777363538742065 ||  ExactMatch test Acc :0.7738     


100%|██████████| 2250/2250 [01:39<00:00, 22.63it/s]
100%|██████████| 250/250 [00:03<00:00, 66.31it/s]
100%|██████████| 625/625 [00:09<00:00, 65.65it/s]
  2%|▏         | 2/100 [03:46<3:04:37, 113.04s/it]

epoch 1 || train loss : 1.296619619846344 ||train acc : 0.9860782623291016 || ExactMatch train Acc :0.8625833333333334 || val loss : 1.2918150806427002 || val acc : 0.9943863749504089 ||  ExactMatch val Acc :0.95 ||test loss : 1.2906443796157836 || test acc : 0.9942272901535034 ||  ExactMatch test Acc :0.9469     


100%|██████████| 2250/2250 [01:41<00:00, 22.11it/s]
100%|██████████| 250/250 [00:04<00:00, 61.55it/s]
100%|██████████| 625/625 [00:10<00:00, 57.86it/s]
  3%|▎         | 3/100 [05:42<3:05:23, 114.68s/it]

epoch 2 || train loss : 1.2908911911116705 ||train acc : 0.9940782785415649 || ExactMatch train Acc :0.9433888888888889 || val loss : 1.28994065618515 || val acc : 0.9963863492012024 ||  ExactMatch val Acc :0.968 ||test loss : 1.2886151863098145 || test acc : 0.996536374092102 ||  ExactMatch test Acc :0.9681     


100%|██████████| 2250/2250 [01:40<00:00, 22.43it/s]
100%|██████████| 250/250 [00:03<00:00, 64.94it/s]
100%|██████████| 625/625 [00:09<00:00, 64.76it/s]
  4%|▍         | 4/100 [07:36<3:02:57, 114.35s/it]

epoch 3 || train loss : 1.2902335262828404 ||train acc : 0.9948409199714661 || ExactMatch train Acc :0.9493611111111111 || val loss : 1.2898715505599976 || val acc : 0.9965454339981079 ||  ExactMatch val Acc :0.9685 ||test loss : 1.2885961896896363 || test acc : 0.9966182112693787 ||  ExactMatch test Acc :0.9685     


100%|██████████| 2250/2250 [01:41<00:00, 22.20it/s]
100%|██████████| 250/250 [00:04<00:00, 60.19it/s]
100%|██████████| 625/625 [00:09<00:00, 66.55it/s]
  5%|▌         | 5/100 [09:31<3:01:23, 114.56s/it]

epoch 4 || train loss : 1.2902296189202203 ||train acc : 0.9947373867034912 || ExactMatch train Acc :0.9479166666666666 || val loss : 1.2898617830276489 || val acc : 0.9965000152587891 ||  ExactMatch val Acc :0.968 ||test loss : 1.2886049385070801 || test acc : 0.9965817928314209 ||  ExactMatch test Acc :0.9682     


100%|██████████| 2250/2250 [01:41<00:00, 22.16it/s]
100%|██████████| 250/250 [00:04<00:00, 61.03it/s]
100%|██████████| 625/625 [00:12<00:00, 48.95it/s]
  6%|▌         | 6/100 [11:30<3:01:32, 115.88s/it]

epoch 5 || train loss : 1.2901486699846056 ||train acc : 0.9947752356529236 || ExactMatch train Acc :0.9480833333333333 || val loss : 1.2898428974151612 || val acc : 0.9965000152587891 ||  ExactMatch val Acc :0.968 ||test loss : 1.2885864786148071 || test acc : 0.9965454339981079 ||  ExactMatch test Acc :0.9677     


100%|██████████| 2250/2250 [01:59<00:00, 18.84it/s]
100%|██████████| 250/250 [00:04<00:00, 50.86it/s]
100%|██████████| 625/625 [00:12<00:00, 50.70it/s]
  7%|▋         | 7/100 [13:46<3:10:09, 122.68s/it]

epoch 6 || train loss : 1.2898538046942818 ||train acc : 0.9951995015144348 || ExactMatch train Acc :0.9518888888888889 || val loss : 1.2898183193206787 || val acc : 0.9965454339981079 ||  ExactMatch val Acc :0.9685 ||test loss : 1.288581279182434 || test acc : 0.9965454339981079 ||  ExactMatch test Acc :0.9678     


100%|██████████| 2250/2250 [01:59<00:00, 18.78it/s]
100%|██████████| 250/250 [00:05<00:00, 48.78it/s]
100%|██████████| 625/625 [00:12<00:00, 48.23it/s]
  8%|▊         | 8/100 [16:04<3:15:32, 127.53s/it]

epoch 7 || train loss : 1.2899288397894966 ||train acc : 0.9950429201126099 || ExactMatch train Acc :0.9509722222222222 || val loss : 1.2898626575469971 || val acc : 0.9964772462844849 ||  ExactMatch val Acc :0.968 ||test loss : 1.2885616296768188 || test acc : 0.9965817928314209 ||  ExactMatch test Acc :0.9682     


100%|██████████| 2250/2250 [01:58<00:00, 18.91it/s]
100%|██████████| 250/250 [00:04<00:00, 51.27it/s]
100%|██████████| 625/625 [00:12<00:00, 50.97it/s]
  9%|▉         | 9/100 [18:20<3:17:29, 130.21s/it]

epoch 8 || train loss : 1.289647921456231 ||train acc : 0.9953812956809998 || ExactMatch train Acc :0.95375 || val loss : 1.2898513278961181 || val acc : 0.9965454339981079 ||  ExactMatch val Acc :0.9685 ||test loss : 1.2886362779617309 || test acc : 0.9965454339981079 ||  ExactMatch test Acc :0.9677     


100%|██████████| 2250/2250 [01:59<00:00, 18.90it/s]
100%|██████████| 250/250 [00:05<00:00, 49.42it/s]
100%|██████████| 625/625 [00:12<00:00, 51.87it/s]
 10%|█         | 10/100 [20:36<3:18:04, 132.05s/it]

epoch 9 || train loss : 1.2899487833446928 ||train acc : 0.9950504899024963 || ExactMatch train Acc :0.951 || val loss : 1.2898915457725524 || val acc : 0.9964772462844849 ||  ExactMatch val Acc :0.96825 ||test loss : 1.2886474153518677 || test acc : 0.9964818358421326 ||  ExactMatch test Acc :0.9679     


100%|██████████| 2250/2250 [02:00<00:00, 18.67it/s]
100%|██████████| 250/250 [00:05<00:00, 49.29it/s]
100%|██████████| 625/625 [00:12<00:00, 50.07it/s]
 11%|█         | 11/100 [22:54<3:18:36, 133.89s/it]

epoch 10 || train loss : 1.2896793936623467 ||train acc : 0.9953712224960327 || ExactMatch train Acc :0.9537777777777777 || val loss : 1.289861198425293 || val acc : 0.9965227246284485 ||  ExactMatch val Acc :0.96825 ||test loss : 1.28862784614563 || test acc : 0.9965545535087585 ||  ExactMatch test Acc :0.968     


100%|██████████| 2250/2250 [01:57<00:00, 19.21it/s]
100%|██████████| 250/250 [00:05<00:00, 45.32it/s]
100%|██████████| 625/625 [00:12<00:00, 49.80it/s]
 12%|█▏        | 12/100 [25:10<3:16:57, 134.29s/it]

epoch 11 || train loss : 1.2898578886985779 ||train acc : 0.9951288104057312 || ExactMatch train Acc :0.9516666666666667 || val loss : 1.2899676508903504 || val acc : 0.9963409304618835 ||  ExactMatch val Acc :0.96675 ||test loss : 1.288691650390625 || test acc : 0.9964545369148254 ||  ExactMatch test Acc :0.9673     


100%|██████████| 2250/2250 [02:01<00:00, 18.57it/s]
100%|██████████| 250/250 [00:05<00:00, 48.12it/s]
100%|██████████| 625/625 [00:12<00:00, 50.48it/s]
 13%|█▎        | 13/100 [27:28<3:16:41, 135.65s/it]

epoch 12 || train loss : 1.2897043511072794 ||train acc : 0.9953863620758057 || ExactMatch train Acc :0.9538611111111112 || val loss : 1.289886923313141 || val acc : 0.9964772462844849 ||  ExactMatch val Acc :0.9675 ||test loss : 1.2886233057022094 || test acc : 0.9965454339981079 ||  ExactMatch test Acc :0.9676     


 56%|█████▋    | 1271/2250 [01:01<00:47, 20.66it/s]
 13%|█▎        | 13/100 [28:30<3:10:47, 131.58s/it]


KeyboardInterrupt: 