In [1]:
# !pip install indic-nlp-library
import torch
import torch.nn as nn
import numpy as np
import math
import random
from tqdm import tqdm
from transformers import BertTokenizer

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [3]:
# Tokenize input sentences
input_sentence = "Hello, how are you?"
tokens = tokenizer(input_sentence, return_tensors="pt")
print(tokens)
input_sentence_hin = "आपका हिंदी में टेक्स्ट यहां है I am here"
tokens_hin = tokenizer(input_sentence_hin, return_tensors="pt")
print(tokens_hin)
print(tokenizer.convert_ids_to_tokens(tokens_hin['input_ids'][0]))

{'input_ids': tensor([[  101, 31178,   117, 14796, 10301, 13028,   136,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,   852, 18187, 15399, 58871, 10532,   875, 53809, 18869, 25695,
         62010, 10569,   146, 10392, 19353,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 'आ', '##प', '##का', 'हिंदी', 'में', 'ट', '##ेक', '##्स', '##्ट', 'यहां', 'है', 'I', 'am', 'here', '[SEP]']


In [4]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [5]:
with open(r"hin.txt", 'r', encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]

hindi_vocab_itos = {
    0: tokenizer.unk_token,
    1: tokenizer.pad_token,
}

english_vocab_itos = {
    0: tokenizer.unk_token,
    1: tokenizer.pad_token
}

hindi_vocab_stoi = {}
english_vocab_stoi = {}


all_hindi_tokens = []
all_english_tokens = []
text_pairs = []
max_seq_len = 0
for line in lines:
    eng, hin, rubbish = line.split("\t")
    eng = ' '.join([contraction_mapping[t.lower()] if t.lower() in contraction_mapping else t.lower() for t in eng.split()])
    text_pairs.append([eng, hin])
    max_seq_len = max(max_seq_len, len(tokenizer.tokenize(hin)), len(tokenizer.tokenize(eng)))
    all_hindi_tokens += tokenizer.encode(hin)
    all_english_tokens += tokenizer.encode(eng)

hindi_tokens_set = set(all_hindi_tokens)
english_tokens_set = set(all_english_tokens)

for i, token in enumerate(hindi_tokens_set, start = 2):
    hindi_vocab_itos[i] = tokenizer.convert_ids_to_tokens([token])[0]

for i, token in enumerate(english_tokens_set, start = 2):
    english_vocab_itos[i] = tokenizer.convert_ids_to_tokens([token])[0]


for id, string in hindi_vocab_itos.items():
    hindi_vocab_stoi[string] = id

for id, string in english_vocab_itos.items():
    english_vocab_stoi[string] = id

max_seq_len += 2 #to account for CLS and SEP tokens
random.shuffle(text_pairs)
print('Max_sequence length:', max_seq_len)
for i in range(5):
    print(random.choice(text_pairs))

Max_sequence length: 54
['do not open that.', 'वह मत खोलिए।']
['mary prided herself on her beauty.', 'मेरी को अपनी सुंदरता पर बहुत नाज़ था।']
["english is the world's language.", 'अंग्रेज़ी वैश्विक भाषा है।']
['i forgot.', 'मैं भूल गया।']
['i would not do that to anybody.', 'मैं ऐसा किसी के साथ नहीं करुंगी।']


In [6]:
num_train = int(0.70 * len(text_pairs))
num_val = int((len(text_pairs) - num_train) / 2)
train_pairs = text_pairs[:num_train]
val_pairs = text_pairs[num_train : num_train + num_val]
test_pairs = text_pairs[num_train + num_val :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

2979 total pairs
2085 training pairs
447 validation pairs
447 test pairs


In [7]:
train_eng = []
for pair in train_pairs:
    train_eng.append(pair[0])

train_hin = []
for pair in train_pairs:
    train_hin.append(pair[1])

target_hin = []
for pair in train_pairs:
    target_hin.append(pair[1])


In [8]:
# Add padding to the tokenized sequences
padded_tokens = tokenizer(train_eng, padding="max_length", truncation=True, return_tensors="pt", max_length=max_seq_len)
# Add padding to the tokenized sequences
padded_tokens_hin = tokenizer(train_hin, padding="max_length", truncation=True, return_tensors="pt", max_length=max_seq_len)

padded_tokens_target = tokenizer(target_hin, padding="max_length", truncation=True, return_tensors="pt", max_length=max_seq_len)


In [9]:
input_sentence = ["hello my name is", "what is the weather today"]
tokens = tokenizer(input_sentence)
print(tokens)
print(tokenizer.convert_ids_to_tokens(tokens['input_ids'][1]))




{'input_ids': [[101, 61694, 10133, 15127, 11324, 10124, 102], [101, 12976, 10124, 10105, 35660, 18745, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
['[CLS]', 'what', 'is', 'the', 'weather', 'today', '[SEP]']


In [10]:
print(len(padded_tokens["input_ids"][0]))

54


In [11]:
input_ids = padded_tokens["input_ids"]
input_ids_hin = padded_tokens_hin["input_ids"]
target_ids = padded_tokens_target["input_ids"]
attention_mask = padded_tokens["attention_mask"]
attention_mask_hin = padded_tokens_hin["attention_mask"]
# print(attention_mask_hin, attention_mask, input_ids, input_ids_hin)
print(input_ids[0])

tensor([  101,   177, 10149, 10472, 34481, 13172, 22111,   119,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [12]:
english_tokens = tokenizer("my name is", padding="max_length", truncation=True, return_tensors="pt", max_length=max_seq_len)
print(len(english_tokens['input_ids'][0]))
english_text = tokenizer.decode(english_tokens['input_ids'][0])
print(english_text)


54
[CLS] my name is [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [13]:
print(input_ids_hin.shape)
print(input_ids.shape)

torch.Size([2085, 54])
torch.Size([2085, 54])


In [14]:
from torch.utils.data import Dataset, DataLoader
# Example usage
english_vocab_size = len(english_vocab_itos)  # Update with your actual vocabulary size
hindi_vocab_size = len(hindi_vocab_itos)
embed_dim = 512
d_model = 512
num_heads = 4
batch_size = 8
num_blocks = 2

def causal_mask(size):
    mask = torch.tril(torch.ones(num_heads, size, size)).type(torch.int)
    return mask

# Assume you have a custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, dataset_pairs, tokenizer, max_seq_len):
        self.dataset_pairs = dataset_pairs
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.dataset_pairs)

    def __getitem__(self, index):

        english = self.dataset_pairs[index][0]
        hindi = self.dataset_pairs[index][1]

        english = self.tokenizer.tokenize(english)
        hindi = self.tokenizer.tokenize(hindi)
        
        english_ids = [english_vocab_stoi["[PAD]"]]*self.max_seq_len
        hindi_ids = [hindi_vocab_stoi["[PAD]"]]*self.max_seq_len
        
        for i, string in enumerate(english):
            english_ids[i] = english_vocab_stoi[string]

        for i, string in enumerate(hindi):
            hindi_ids[i] = hindi_vocab_stoi[string]

        english_ids = torch.tensor(english_ids)
        hindi_ids = torch.tensor(hindi_ids)
        decoder_input = hindi_ids.clone()
        decoder_input[decoder_input == hindi_vocab_stoi["[SEP]"]] = hindi_vocab_stoi["[PAD]"]
        target = hindi_ids.clone()
        target = torch.cat((target[1:], torch.tensor([hindi_vocab_stoi["[PAD]"]])))

        encoder_mask = torch.tensor([elem.item() != english_vocab_stoi["[PAD]"] for elem in english_ids]).float()
        decoder_mask = torch.tensor([elem.item() != hindi_vocab_stoi["[PAD]"] for elem in decoder_input]).unsqueeze(0) & causal_mask(self.max_seq_len)
        return {
            'encoder_input': english_ids, 
            'decoder_input': decoder_input, 
            'target': target, 
            'encoder_mask': encoder_mask, 
            'decoder_mask': decoder_mask.to(torch.bool),
        }



# Create a dataset and data loader
translation_dataset = TranslationDataset(dataset_pairs=train_pairs, tokenizer=tokenizer, max_seq_len=max_seq_len)
train_data_loader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True)



In [15]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, max_length, d_model):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)  # Add batch dimension
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def forward(self, x):
      return x + self.encoding[:, :x.size(1)].detach().to(self.device)


class EncoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, max_seq_len):
        super(EncoderBlock, self).__init__()
        self.multi_head_attention = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.feedforward = nn.Sequential(
            nn.Linear(max_seq_len * d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, max_seq_len * d_model),
        )
        

    def forward(self, src, encoder_mask):
      
        out1, _ = self.multi_head_attention(src, src, src, key_padding_mask=encoder_mask)
        src = src + out1  
        src = self.norm1(src)
        flat = torch.flatten(src, start_dim=1)
        out2 = self.feedforward(flat)
        out2 = out2.view(src.shape)
        src = src + out2  
        src = self.norm1(src)

        return src

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, max_seq_len, num_blocks):
        super(Encoder, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(max_length=max_seq_len, d_model=d_model)
        self.encoder_blocks = nn.ModuleList([EncoderBlock(d_model, num_heads, max_seq_len) for _ in range(num_blocks)])
        
    def forward(self, src, encoder_mask):

        src = self.embedding_layer(src)
        src = self.position_encoding(src)
        
        for encoder_block in self.encoder_blocks:
            src = encoder_block(src, encoder_mask)

        return src


In [16]:

import torch.nn.functional as F

class DecoderBlock(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, max_seq_len, batch_size):
        super(DecoderBlock, self).__init__()
        self.self_attention = nn.MultiheadAttention(d_model, num_heads=num_heads, batch_first = True)
        self.feedforward = nn.Sequential(
            nn.Linear(d_model*max_seq_len, 2048),
            nn.ReLU(),
            nn.Linear(2048, d_model*max_seq_len)
        )
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.batch_size=batch_size
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

    def forward(self, embedded_hin, decoder_mask, encoder_output):
      
        decoder_mask=decoder_mask.view(-1, self.max_seq_len, self.max_seq_len)
        self_attention_output, _ = self.self_attention(embedded_hin, embedded_hin, embedded_hin, attn_mask=decoder_mask)
        self_attention_output = self.layer_norm1(embedded_hin + self_attention_output)
        multi_out, _ = self.self_attention(encoder_output, encoder_output, self_attention_output, attn_mask=decoder_mask)
        multi_out = self.layer_norm1(multi_out + self_attention_output)
        flat = torch.flatten(multi_out, start_dim =1)
        feedforward_output = self.feedforward(flat)
        feedforward_output = feedforward_output.view(multi_out.shape)
        decoder_output = self.layer_norm1(multi_out + feedforward_output)
        

        return decoder_output

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, max_seq_len, batch_size, num_blocks):
        super(Decoder, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(vocab_size, d_model)
        self.decoder_block = nn.ModuleList([DecoderBlock(vocab_size, d_model, num_heads, max_seq_len, batch_size) for _ in range(num_blocks)])
        
    def forward(self, src, decoder_mask, encoder_output):
        
        src = self.embedding_layer(src)
        src = self.positional_encoding(src)

        for decoder_block in self.decoder_block:
            src = decoder_block(src, decoder_mask, encoder_output)

        return src

In [17]:
class ProjectionLayer(nn.Module):
    def __init__(self, vocab_size, d_model) -> None:
        super(ProjectionLayer, self).__init__()

        self.linear = nn.Linear(d_model, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        return self.linear(x)

In [18]:
class Transformer(nn.Module):
    def __init__(self, english_vocab_size, hindi_vocab_size, d_model, num_heads, max_seq_len, num_blocks_encoder, num_blocks_decoder, batch_size) -> None:
        super(Transformer, self).__init__()

        self.encoder = Encoder(english_vocab_size, d_model, num_heads, max_seq_len=max_seq_len, num_blocks=num_blocks_encoder)
        self.decoder = Decoder(hindi_vocab_size, d_model, num_heads, max_seq_len=max_seq_len, batch_size=batch_size, num_blocks=num_blocks_decoder)
        self.projection = ProjectionLayer(hindi_vocab_size, d_model)
    
    def forward(self, batch, device = 'cpu'):

        out = self.encoder(batch['encoder_input'].to(device), batch['encoder_mask'].to(device))
        out = self.decoder(batch['decoder_input'].to(device), batch['decoder_mask'].to(device), out)
        out = self.projection(out)
        return out

In [23]:
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using device', device)

# Define a loss function (e.g., cross-entropy) and an optimizer (e.g., Adam)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id).to(device)
net = Transformer(english_vocab_size, hindi_vocab_size, d_model, num_heads, max_seq_len=max_seq_len, batch_size=batch_size, num_blocks_encoder=num_blocks, num_blocks_decoder=num_blocks).to(device)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001)
net.train()

# Training loop
num_epochs = 100  # Adjust as needed
# Example of training loop
for epoch in tqdm(range(num_epochs)):  # Run for 3 epochs as an example
    for batch in train_data_loader:
        out = net(batch, device)
        # Compute the loss
        target = batch['target'].to(device).long()
        loss = criterion(out.view(-1, hindi_vocab_size), target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Print the loss after each epoch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# Save the trained models if needed
torch.save(net.state_dict(), 'net.pth')


Using device cuda


  1%|          | 1/100 [00:18<31:03, 18.82s/it]

Epoch 1/100, Loss: 1.4950737953186035


  2%|▏         | 2/100 [00:38<31:06, 19.05s/it]

Epoch 2/100, Loss: 1.2518571615219116


  3%|▎         | 3/100 [00:56<30:26, 18.83s/it]

Epoch 3/100, Loss: 1.081704020500183


  4%|▍         | 4/100 [01:15<29:56, 18.71s/it]

Epoch 4/100, Loss: 1.5252195596694946


  5%|▌         | 5/100 [01:33<29:31, 18.65s/it]

Epoch 5/100, Loss: 1.087630033493042


  6%|▌         | 6/100 [01:52<29:15, 18.67s/it]

Epoch 6/100, Loss: 1.1249574422836304


  7%|▋         | 7/100 [02:11<29:00, 18.72s/it]

Epoch 7/100, Loss: 1.3743969202041626


  8%|▊         | 8/100 [02:30<28:56, 18.87s/it]

Epoch 8/100, Loss: 1.1870533227920532


  9%|▉         | 9/100 [02:49<28:38, 18.89s/it]

Epoch 9/100, Loss: 0.7773582339286804


 10%|█         | 10/100 [03:07<28:10, 18.79s/it]

Epoch 10/100, Loss: 1.3084651231765747


 11%|█         | 11/100 [03:25<27:33, 18.58s/it]

Epoch 11/100, Loss: 1.0319551229476929


 12%|█▏        | 12/100 [03:44<27:00, 18.42s/it]

Epoch 12/100, Loss: 1.2776039838790894


 13%|█▎        | 13/100 [04:02<26:44, 18.44s/it]

Epoch 13/100, Loss: 1.2801989316940308


 14%|█▍        | 14/100 [04:20<26:24, 18.42s/it]

Epoch 14/100, Loss: 1.0797854661941528


 15%|█▌        | 15/100 [04:39<26:05, 18.42s/it]

Epoch 15/100, Loss: 1.3285444974899292


 16%|█▌        | 16/100 [04:57<25:45, 18.40s/it]

Epoch 16/100, Loss: 0.9949142932891846


 17%|█▋        | 17/100 [05:16<25:28, 18.42s/it]

Epoch 17/100, Loss: 1.0045530796051025


 18%|█▊        | 18/100 [05:34<25:07, 18.38s/it]

Epoch 18/100, Loss: 1.530339241027832


 19%|█▉        | 19/100 [05:52<24:48, 18.38s/it]

Epoch 19/100, Loss: 1.1044594049453735


 20%|██        | 20/100 [06:10<24:22, 18.28s/it]

Epoch 20/100, Loss: 1.133337378501892


 21%|██        | 21/100 [06:29<24:03, 18.27s/it]

Epoch 21/100, Loss: 1.122960090637207


 22%|██▏       | 22/100 [06:47<23:43, 18.26s/it]

Epoch 22/100, Loss: 1.0763630867004395


 23%|██▎       | 23/100 [07:05<23:24, 18.24s/it]

Epoch 23/100, Loss: 0.8910660147666931


 24%|██▍       | 24/100 [07:23<22:57, 18.13s/it]

Epoch 24/100, Loss: 1.0378060340881348


 25%|██▌       | 25/100 [07:41<22:29, 18.00s/it]

Epoch 25/100, Loss: 1.2833051681518555


 26%|██▌       | 26/100 [08:00<22:34, 18.30s/it]

Epoch 26/100, Loss: 0.9276813268661499


 27%|██▋       | 27/100 [08:19<22:34, 18.55s/it]

Epoch 27/100, Loss: 1.1363412141799927


 28%|██▊       | 28/100 [08:38<22:27, 18.72s/it]

Epoch 28/100, Loss: 1.104583978652954


 29%|██▉       | 29/100 [08:57<22:21, 18.89s/it]

Epoch 29/100, Loss: 0.89387047290802


 30%|███       | 30/100 [09:16<22:06, 18.95s/it]

Epoch 30/100, Loss: 1.209564208984375


 31%|███       | 31/100 [09:35<21:47, 18.95s/it]

Epoch 31/100, Loss: 1.0233205556869507


 32%|███▏      | 32/100 [09:54<21:22, 18.85s/it]

Epoch 32/100, Loss: 0.9159115552902222


 33%|███▎      | 33/100 [10:12<20:59, 18.79s/it]

Epoch 33/100, Loss: 1.1734423637390137


 34%|███▍      | 34/100 [10:32<20:54, 19.01s/it]

Epoch 34/100, Loss: 1.3295859098434448


 35%|███▌      | 35/100 [10:51<20:41, 19.09s/it]

Epoch 35/100, Loss: 1.524821400642395


 36%|███▌      | 36/100 [11:10<20:20, 19.07s/it]

Epoch 36/100, Loss: 1.0756028890609741


 37%|███▋      | 37/100 [11:29<20:02, 19.08s/it]

Epoch 37/100, Loss: 1.0535496473312378


 38%|███▊      | 38/100 [11:48<19:35, 18.97s/it]

Epoch 38/100, Loss: 1.0953799486160278


 39%|███▉      | 39/100 [12:07<19:10, 18.87s/it]

Epoch 39/100, Loss: 1.3289161920547485


 40%|████      | 40/100 [12:25<18:45, 18.75s/it]

Epoch 40/100, Loss: 1.0590287446975708


 41%|████      | 41/100 [12:44<18:24, 18.71s/it]

Epoch 41/100, Loss: 0.7571190595626831


 42%|████▏     | 42/100 [13:02<18:01, 18.64s/it]

Epoch 42/100, Loss: 1.116459846496582


 43%|████▎     | 43/100 [13:21<17:36, 18.54s/it]

Epoch 43/100, Loss: 1.1811484098434448


 44%|████▍     | 44/100 [13:39<17:13, 18.45s/it]

Epoch 44/100, Loss: 1.223030924797058


 45%|████▌     | 45/100 [13:57<16:56, 18.47s/it]

Epoch 45/100, Loss: 1.0260226726531982


 46%|████▌     | 46/100 [14:16<16:35, 18.43s/it]

Epoch 46/100, Loss: 1.2381945848464966


 47%|████▋     | 47/100 [14:34<16:21, 18.51s/it]

Epoch 47/100, Loss: 0.9486432075500488


 48%|████▊     | 48/100 [14:53<16:05, 18.56s/it]

Epoch 48/100, Loss: 1.2970080375671387


 49%|████▉     | 49/100 [15:12<15:45, 18.53s/it]

Epoch 49/100, Loss: 0.862549364566803


 50%|█████     | 50/100 [15:31<15:34, 18.69s/it]

Epoch 50/100, Loss: 0.9206745624542236


 51%|█████     | 51/100 [15:50<15:21, 18.81s/it]

Epoch 51/100, Loss: 0.7208049297332764


 52%|█████▏    | 52/100 [16:09<15:09, 18.95s/it]

Epoch 52/100, Loss: 1.0780538320541382


 53%|█████▎    | 53/100 [16:28<14:54, 19.04s/it]

Epoch 53/100, Loss: 1.0832610130310059


 54%|█████▍    | 54/100 [16:46<14:20, 18.71s/it]

Epoch 54/100, Loss: 1.1265733242034912


 55%|█████▌    | 55/100 [17:04<13:51, 18.48s/it]

Epoch 55/100, Loss: 0.9983500242233276


 56%|█████▌    | 56/100 [17:22<13:23, 18.26s/it]

Epoch 56/100, Loss: 1.1652750968933105


 57%|█████▋    | 57/100 [17:40<13:01, 18.17s/it]

Epoch 57/100, Loss: 1.0769610404968262


 58%|█████▊    | 58/100 [17:58<12:38, 18.07s/it]

Epoch 58/100, Loss: 1.0065721273422241


 59%|█████▉    | 59/100 [18:15<12:17, 18.00s/it]

Epoch 59/100, Loss: 0.9180402755737305


 60%|██████    | 60/100 [18:33<11:56, 17.92s/it]

Epoch 60/100, Loss: 1.06667160987854


 61%|██████    | 61/100 [18:51<11:36, 17.87s/it]

Epoch 61/100, Loss: 0.9752997756004333


 62%|██████▏   | 62/100 [19:09<11:20, 17.90s/it]

Epoch 62/100, Loss: 1.0505688190460205


 63%|██████▎   | 63/100 [19:27<11:02, 17.90s/it]

Epoch 63/100, Loss: 0.9450486898422241


 64%|██████▍   | 64/100 [19:45<10:44, 17.89s/it]

Epoch 64/100, Loss: 1.121248483657837


 65%|██████▌   | 65/100 [20:03<10:27, 17.94s/it]

Epoch 65/100, Loss: 0.9675180912017822


 66%|██████▌   | 66/100 [20:21<10:13, 18.05s/it]

Epoch 66/100, Loss: 1.2854645252227783


 67%|██████▋   | 67/100 [20:39<09:58, 18.14s/it]

Epoch 67/100, Loss: 1.1595577001571655


 68%|██████▊   | 68/100 [20:58<09:44, 18.26s/it]

Epoch 68/100, Loss: 0.9123144745826721


 69%|██████▉   | 69/100 [21:16<09:25, 18.25s/it]

Epoch 69/100, Loss: 1.1933326721191406


 70%|███████   | 70/100 [21:34<09:08, 18.28s/it]

Epoch 70/100, Loss: 1.0693202018737793


 71%|███████   | 71/100 [21:53<08:50, 18.30s/it]

Epoch 71/100, Loss: 1.1079955101013184


 72%|███████▏  | 72/100 [22:11<08:31, 18.27s/it]

Epoch 72/100, Loss: 1.1809109449386597


 73%|███████▎  | 73/100 [22:29<08:09, 18.12s/it]

Epoch 73/100, Loss: 1.4490655660629272


 74%|███████▍  | 74/100 [22:47<07:48, 18.01s/it]

Epoch 74/100, Loss: 0.9102973937988281


 75%|███████▌  | 75/100 [23:04<07:28, 17.95s/it]

Epoch 75/100, Loss: 0.7364130020141602


 76%|███████▌  | 76/100 [23:22<07:10, 17.93s/it]

Epoch 76/100, Loss: 1.0212128162384033


 77%|███████▋  | 77/100 [23:41<06:57, 18.14s/it]

Epoch 77/100, Loss: 1.0772887468338013


 78%|███████▊  | 78/100 [24:00<06:43, 18.35s/it]

Epoch 78/100, Loss: 1.2004708051681519


 79%|███████▉  | 79/100 [24:19<06:29, 18.57s/it]

Epoch 79/100, Loss: 1.1417198181152344


 80%|████████  | 80/100 [24:38<06:13, 18.70s/it]

Epoch 80/100, Loss: 0.9991678595542908


 81%|████████  | 81/100 [24:56<05:53, 18.61s/it]

Epoch 81/100, Loss: 0.9045661091804504


 82%|████████▏ | 82/100 [25:14<05:30, 18.36s/it]

Epoch 82/100, Loss: 0.8693292140960693


 83%|████████▎ | 83/100 [25:32<05:09, 18.20s/it]

Epoch 83/100, Loss: 1.1541393995285034


 84%|████████▍ | 84/100 [25:50<04:49, 18.08s/it]

Epoch 84/100, Loss: 0.9447544813156128


 85%|████████▌ | 85/100 [26:08<04:30, 18.03s/it]

Epoch 85/100, Loss: 0.7450308799743652


 86%|████████▌ | 86/100 [26:25<04:11, 17.98s/it]

Epoch 86/100, Loss: 1.1657819747924805


 87%|████████▋ | 87/100 [26:43<03:53, 17.95s/it]

Epoch 87/100, Loss: 0.8745642304420471


 88%|████████▊ | 88/100 [27:01<03:35, 17.93s/it]

Epoch 88/100, Loss: 1.149208903312683


 89%|████████▉ | 89/100 [27:19<03:17, 17.92s/it]

Epoch 89/100, Loss: 0.7350824475288391


 90%|█████████ | 90/100 [27:37<02:59, 17.92s/it]

Epoch 90/100, Loss: 0.9980380535125732


 91%|█████████ | 91/100 [27:55<02:41, 17.91s/it]

Epoch 91/100, Loss: 1.3997743129730225


 92%|█████████▏| 92/100 [28:13<02:23, 17.89s/it]

Epoch 92/100, Loss: 1.168387770652771


 93%|█████████▎| 93/100 [28:31<02:05, 17.93s/it]

Epoch 93/100, Loss: 0.9098108410835266


 94%|█████████▍| 94/100 [28:49<01:48, 18.08s/it]

Epoch 94/100, Loss: 1.1113200187683105


 95%|█████████▌| 95/100 [29:07<01:30, 18.15s/it]

Epoch 95/100, Loss: 1.142215371131897


 96%|█████████▌| 96/100 [29:25<01:12, 18.00s/it]

Epoch 96/100, Loss: 1.1461313962936401


 97%|█████████▋| 97/100 [29:43<00:53, 17.91s/it]

Epoch 97/100, Loss: 1.2862664461135864


 98%|█████████▊| 98/100 [30:00<00:35, 17.84s/it]

Epoch 98/100, Loss: 0.9091157913208008


 99%|█████████▉| 99/100 [30:18<00:17, 17.78s/it]

Epoch 99/100, Loss: 1.2517513036727905


100%|██████████| 100/100 [30:36<00:00, 18.36s/it]

Epoch 100/100, Loss: 1.3033874034881592



