In [1]:
!pip install datasets
!pip install bpemb

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/519.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

In [2]:
import nltk
from bpemb import BPEmb
import string
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from torch import nn
import torch

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

<a href="https://colab.research.google.com/github/Axel0087/NLP2023/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
dataset = load_dataset("copenlu/answerable_tydiqa")

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [4]:
train_set = dataset["train"]
validation_set = dataset["validation"]

In [5]:
def get_answer_start(row):
  return row["annotations"]["answer_start"][0]

def get_answer(row):
  return row["annotations"]["answer_text"][0]

def get_document(row):
  return row["document_plaintext"]

def get_question(row):
  return row["question_text"]

def oracle(answer, document):
  return answer != "" and answer in document

def get_language(dataset, lang):
  return [row for row in dataset if row['language'] == lang]

In [6]:
train_arabic = get_language(train_set, "arabic")
val_arabic = get_language(validation_set, "arabic")

train_bengali = get_language(train_set, "bengali")
val_bengali = get_language(validation_set, "bengali")

train_indonesian = get_language(train_set, "indonesian")
val_indonesian = get_language(validation_set, "indonesian")

In [7]:
def ratio_string(train, val):
  val_ratio = round(len(val)/len(train)*100)
  train_ratio = 100-val_ratio
  return f"{train_ratio} / {val_ratio}"

def answerable_ratio(ds):
  answerable = round(sum([1 for row in ds if get_answer_start(row) == -1])/len(ds)*100)
  nonansw = 100-answerable
  return f"{answerable} / {nonansw}"

print(f"""
Dataset features:

{train_set.column_names}

Dataset sizes:

(Arabic) Training set:                                          {len(train_arabic)}
(Arabic) Validation set:                                        {len(val_arabic)}
(Arabic) Ratio (Training/Val):                                  {ratio_string(train_arabic, val_arabic)}
(Arabic) Training balance (Answerable / Not answerable):        {answerable_ratio(train_arabic)}
(Arabic) Validation balance (Answerable / Not answerable):      {answerable_ratio(val_arabic)}

(Bengali) Training set:                                         {len(train_bengali)}
(Bengali) Validation set:                                       {len(val_bengali)}
(Bengali) Ratio (Training/Val):                                 {ratio_string(train_bengali, val_bengali)}
(Bengali) Training balance (Answerable / Not answerable):       {answerable_ratio(train_bengali)}
(Bengali) Validation balance (Answerable / Not answerable):     {answerable_ratio(val_bengali)}

(Indonesian) Training set:                                      {len(train_indonesian)}
(Indonesian) Validation set:                                    {len(val_indonesian)}
(Indonesian) Ratio (Training/Val):                              {ratio_string(train_indonesian, val_indonesian)}
(Indonesian) Training balance (Answerable / Not answerable):    {answerable_ratio(train_indonesian)}
(Indonesian) Validation balance (Answerable / Not answerable):  {answerable_ratio(val_indonesian)}
""")


Dataset features:

['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']

Dataset sizes:

(Arabic) Training set:                                          29598
(Arabic) Validation set:                                        1902
(Arabic) Ratio (Training/Val):                                  94 / 6
(Arabic) Training balance (Answerable / Not answerable):        50 / 50
(Arabic) Validation balance (Answerable / Not answerable):      50 / 50

(Bengali) Training set:                                         4779
(Bengali) Validation set:                                       224
(Bengali) Ratio (Training/Val):                                 95 / 5
(Bengali) Training balance (Answerable / Not answerable):       50 / 50
(Bengali) Validation balance (Answerable / Not answerable):     50 / 50

(Indonesian) Training set:                                      11394
(Indonesian) Validation set:                                    1191
(Indonesian) Ra

In [8]:
def bag_of_words(dataset, column):
  bag = {}
  for row in dataset:
    tokens = nltk.word_tokenize(row[column])

    for token in tokens:

      if not token in bag:
        bag[token] = 0

      bag[token] += 1
      #print(bag)
  return sorted(bag.items(), key=lambda item: item[1], reverse=True)

#def sort_bag(bag):
#  return sorted(bag.items(), key=lambda item: item[1], reverse=True)

In [9]:
#arabic_doc_bow = bag_of_words(train_arabic, "document_plaintext")
#arabic_question_bow = bag_of_words(train_arabic, "question_text")
#
#bengali_doc_bow = bag_of_words(train_bengali, "document_plaintext")
#bengali_question_bow = bag_of_words(train_bengali, "question_text")
#
#indonesian_doc_bow = bag_of_words(train_indonesian, "document_plaintext")
#indonesian_question_bow = bag_of_words(train_indonesian, "question_text")

In [10]:
#print(f"""
#
#Most common words:
#
#(Arabic) Documents: {arabic_doc_bow[0:5]}
#(Arabic) Questions: {arabic_question_bow[0:5]}
#
#(Bengali) Documents: {bengali_doc_bow[0:5]}
#(Bengali) Questions: {bengali_question_bow[0:5]}
#
#(Indonesian) Documents: {indonesian_doc_bow[0:5]}
#(Indonesian) Questions: {indonesian_question_bow[0:5]}
#""")

In [11]:
def get_ratio(question, document, stop_words):
  tokens = nltk.word_tokenize(question)
  count = 0
  stripped_tokens = set(tokens) - stop_words
  for token in stripped_tokens:
    if token in document:
      count += 1
  return count/len(stripped_tokens)


def avg(lst):
  return sum(lst)/len(lst)

def get_average_ratios(training, stop_words):
  answerable_ratios = []
  nonanswerable_ratios = []
  for row in training:
    ratio = get_ratio(get_question(row), get_document(row), stop_words)
    lst = answerable_ratios if oracle(get_answer(row), get_document(row)) else nonanswerable_ratios
    lst.append(ratio)
  return avg(answerable_ratios), avg(nonanswerable_ratios)

class NaiveModel:
  def __init__(self, stop_words):
    self.stop_words = stop_words
    self.ratio = -1

  def train(self, training):
    answerable_ratio, nonanswerable_ratio = get_average_ratios(training, self.stop_words)
    self.ratio = (answerable_ratio + nonanswerable_ratio)/2

  def classify(self, question, document):
    return get_ratio(question, document, self.stop_words) > self.ratio

def evaluate(validation, model):
  res = [int(oracle(get_answer(row), get_document(row)) == model.classify(get_question(row), get_document(row))) for row in validation]
  acc = avg(res)

  ### Manual generation of confusion matrix for scores like Balanced Accuray and F-score
  #tp, fp, tn, fn = 0, 0, 0, 0
  #for row in validation:
  #  gt = oracle(get_answer(row), get_document(row))
  #  cl = model.classify(get_question(row), get_document(row))
  #  if (cl):
  #    if (gt):
  #      tp += 1
  #    else:
  #      fp += 1
  #  else:
  #    if (gt):
  #      fn += 1
  #    else:
  #      tn += 1
  #tpr = tp / (tp + fn)
  #tnr = tn / (tn + fp)
  #ba = (tpr + tnr) / 2

  print(f"Accuracy: {round(acc*100, 4)}%\n")

In [12]:
#from nltk.corpus import stopwords
#
#nltk.download('stopwords')
#
#arabic_stop_words = set(stopwords.words('indonesian')) | set(string.punctuation) | set("؟")
#bengali_stop_words = set(stopwords.words('bengali')) | set(string.punctuation)
#indonesian_stop_words = set(stopwords.words('indonesian')) | set(string.punctuation)

In [13]:
#print("Evaluating arabic:")
#
#arabic_model = NaiveModel(arabic_stop_words)
#arabic_model.train(train_arabic)
#evaluate(val_arabic, arabic_model)
#
#print("Evaluating bengali:")
#
#bengali_model = NaiveModel(bengali_stop_words)
#bengali_model.train(train_bengali)
#evaluate(val_bengali, bengali_model)
#
#print("Evaluating indonesian:")
#
#indonesian_model = NaiveModel(indonesian_stop_words)
#indonesian_model.train(train_indonesian)
#evaluate(val_indonesian, indonesian_model)

In [14]:
vocab_size = 25000
encoding_dim = 100

bpemb_ar = BPEmb(lang='ar', dim=encoding_dim, vs=vocab_size)
bpemb_bn = BPEmb(lang='bn', dim=encoding_dim, vs=vocab_size)
bpemb_in = BPEmb(lang='id', dim=encoding_dim, vs=vocab_size)


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.model


100%|██████████| 742254/742254 [00:00<00:00, 945413.74B/s] 


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9491724/9491724 [00:01<00:00, 5492614.15B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.model


100%|██████████| 863227/863227 [00:00<00:00, 1088420.24B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9517491/9517491 [00:01<00:00, 5981039.30B/s] 


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.model


100%|██████████| 650018/650018 [00:00<00:00, 830698.20B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs25000.d100.w2v.bin.tar.gz


100%|██████████| 9465922/9465922 [00:01<00:00, 5946048.42B/s]


In [15]:
train_arabic_doc = [get_document(row) for row in train_arabic]
train_arabic_question = [get_question(row) for row in train_arabic]

train_bengali_doc = [get_document(row) for row in train_bengali]
train_bengali_question = [get_question(row) for row in train_bengali]

train_indonesian_doc = [get_document(row) for row in train_indonesian]
train_indonesian_question = [get_question(row) for row in train_indonesian]

In [16]:
def get_bpemb_features(dataset, bpemb):
  return [bpemb.embed(x) for x in tqdm(dataset)]

def text_to_ids(text, tokenizer):
    input_ids = tokenizer.encode_ids_with_eos(text)
    return input_ids, len(input_ids)

def pad_input(input):
    input_ids = [i[0] for i in input]
    seq_lens = [i[1] for i in input]

    max_length = max(seq_lens)

    input_ids = [(i + [25000] * (max_length - len(i))) for i in input_ids]

    # Make sure each sample is max_length long
    assert (all(len(i) == max_length for i in input_ids))
    return torch.tensor(input_ids), torch.tensor(seq_lens)

In [17]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m143.4/179.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [18]:
from torch.utils.data import DataLoader, Dataset

class DatasetReader(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    row = self.data[idx]
    # Calls the text_to_batch function
    input_ids, seq_lens = text_to_ids(row, self.tokenizer)
    return input_ids, seq_lens


In [19]:
from torcheval.metrics.text import Perplexity

class LSTMNetwork(nn.Module):
    def __init__(
            self,
            pretrained_embeddings,
            vocab_size: int,
            num_layers,
            hidden_dim: int,
            dropout_rate: float = 0.1,
            bidirectional: bool = False
    ):
        super(LSTMNetwork, self).__init__()

        self.vocab_size = vocab_size

        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1)

        self.lstm = nn.LSTM(
                pretrained_embeddings.shape[1],
                hidden_dim,
                num_layers,
                batch_first=True,
                dropout=dropout_rate,
                bidirectional=bidirectional)

        self.dropout = nn.Dropout(dropout_rate)

        self.output_layer = nn.Linear(2*hidden_dim if bidirectional else hidden_dim, vocab_size)

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        all_params = list(self.lstm.named_parameters()) + list(self.output_layer.named_parameters())
        for n,p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens):

        embeds = self.embeddings(inputs)

        # Pack padded: This is necessary for padded batches input to an RNN
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        lstm_out, _ = self.lstm(lstm_in)

        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        targets = torch.flatten(inputs.clone())

        output = self.output_layer(lstm_out)
        logits = output.view(-1, self.vocab_size)

        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits, targets)

        return (output, loss)


In [24]:
from math import exp

def train(
    model,
    dl,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device,
    patience: int = 10
):
  # Keep track of the loss and best accuracy
  best_ppl = float("inf")
  pcounter = 0

  #metric=Perplexity().to(device)

  # Iterate through epochs
  for ep in range(n_epochs):

    losses_epoch = []

    for batch in tqdm(dl):
      model.train()
      optimizer.zero_grad()

      batch = tuple(t.to(device) for t in batch)
      inputs = batch[0]
      seq_lens = batch[1]

      (output, loss) = model(inputs, seq_lens)

      losses_epoch.append(loss.item())
      #targets = inputs.clone()
      #metric.update(output, targets)

      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()

    avg_loss = sum(losses_epoch) / len(losses_epoch)
    ppl = exp(avg_loss)
    #ppl = metric.compute()
    #metric.reset()

    print(f'Avg Loss/Perplexity at epoch {ep}: {avg_loss}/{ppl}')

    # Keep track of the best model based on the accuracy
    if ppl < best_ppl:
      torch.save(model.state_dict(), 'best_model')
      best_ppl = ppl
      pcounter = 0
    else:
      pcounter += 1
      if pcounter == patience:
        break

  model.load_state_dict(torch.load('best_model'))
  return model

In [None]:
from torch.optim import Adam

num_layers = 2
hidden_dim = 100
dropout_rate = 0.1
lr = 0.0001
n_epochs = 100
batch_size = 32

bidirectional = True

patience = 5

ds = train_bengali_question
embs = bpemb_bn

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

pretrained_embeddings = torch.Tensor(np.concatenate([embs.emb.vectors, np.zeros(shape=(1,100))], axis=0))
vocabulary = embs.emb.index_to_key + ['[PAD]']

model = LSTMNetwork(pretrained_embeddings, len(vocabulary), num_layers, hidden_dim, dropout_rate, bidirectional).to(device)

reader = DatasetReader(ds, embs)

dl = DataLoader(reader, batch_size=batch_size, collate_fn=pad_input, shuffle=True, num_workers=2)

optimizer = Adam(model.parameters(), lr=lr)

train(model, dl, optimizer, n_epochs, device, patience)

100%|██████████| 150/150 [00:02<00:00, 52.30it/s]


Avg Loss/Perplexity at epoch 0: 9.43280296643575/12491.490936798229


100%|██████████| 150/150 [00:02<00:00, 57.48it/s]


Avg Loss/Perplexity at epoch 1: 8.076638329823812/3218.3958456540913


100%|██████████| 150/150 [00:02<00:00, 62.49it/s]


Avg Loss/Perplexity at epoch 2: 7.970094337463379/2893.1302819190328


100%|██████████| 150/150 [00:02<00:00, 51.25it/s]


Avg Loss/Perplexity at epoch 3: 7.840132395426433/2540.541167593064


100%|██████████| 150/150 [00:02<00:00, 56.71it/s]


Avg Loss/Perplexity at epoch 4: 7.707832597096761/2225.7130097734916


100%|██████████| 150/150 [00:03<00:00, 49.84it/s]


Avg Loss/Perplexity at epoch 5: 7.646010882059733/2092.2825580626068


100%|██████████| 150/150 [00:02<00:00, 61.16it/s]


Avg Loss/Perplexity at epoch 6: 7.573525307973226/1945.9884120467796


100%|██████████| 150/150 [00:02<00:00, 62.18it/s]


Avg Loss/Perplexity at epoch 7: 7.487263129552205/1785.1596492181955


100%|██████████| 150/150 [00:02<00:00, 61.97it/s]


Avg Loss/Perplexity at epoch 8: 7.382683881123861/1607.8993928144575


100%|██████████| 150/150 [00:02<00:00, 58.58it/s]


Avg Loss/Perplexity at epoch 9: 7.323015410105388/1514.7647268956575


100%|██████████| 150/150 [00:02<00:00, 53.18it/s]


Avg Loss/Perplexity at epoch 10: 7.207952845891317/1350.1255213544857


100%|██████████| 150/150 [00:02<00:00, 60.29it/s]


Avg Loss/Perplexity at epoch 11: 7.152883542378744/1277.7851957639375


100%|██████████| 150/150 [00:02<00:00, 60.26it/s]


Avg Loss/Perplexity at epoch 12: 7.052097988128662/1155.2799657001447


100%|██████████| 150/150 [00:02<00:00, 61.33it/s]


Avg Loss/Perplexity at epoch 13: 6.9908186721801755/1086.610690091572


100%|██████████| 150/150 [00:02<00:00, 59.04it/s]


Avg Loss/Perplexity at epoch 14: 6.907035582860311/999.2805627973095


100%|██████████| 150/150 [00:02<00:00, 53.41it/s]


Avg Loss/Perplexity at epoch 15: 6.834762029647827/929.6071054714029


100%|██████████| 150/150 [00:02<00:00, 61.28it/s]


Avg Loss/Perplexity at epoch 16: 6.7448655764261884/849.6849013192129


100%|██████████| 150/150 [00:02<00:00, 61.04it/s]


Avg Loss/Perplexity at epoch 17: 6.658904333114624/779.696181661784


100%|██████████| 150/150 [00:03<00:00, 48.17it/s]


Avg Loss/Perplexity at epoch 18: 6.55503488222758/702.7736637062748


100%|██████████| 150/150 [00:02<00:00, 54.29it/s]


Avg Loss/Perplexity at epoch 19: 6.487604637145996/656.9478485304842


100%|██████████| 150/150 [00:02<00:00, 56.54it/s]


Avg Loss/Perplexity at epoch 20: 6.455779364903768/636.3694970863348


100%|██████████| 150/150 [00:02<00:00, 61.45it/s]


Avg Loss/Perplexity at epoch 21: 6.330498088200887/561.4361689519878


100%|██████████| 150/150 [00:02<00:00, 62.11it/s]


Avg Loss/Perplexity at epoch 22: 6.2981772740681965/543.5802088579828


100%|██████████| 150/150 [00:02<00:00, 60.68it/s]


Avg Loss/Perplexity at epoch 23: 6.202001298268636/493.73616633261526


100%|██████████| 150/150 [00:02<00:00, 52.22it/s]


Avg Loss/Perplexity at epoch 24: 6.14818367322286/467.86681553339986


100%|██████████| 150/150 [00:02<00:00, 59.61it/s]


Avg Loss/Perplexity at epoch 25: 6.061962881088257/429.2171126875134


100%|██████████| 150/150 [00:02<00:00, 62.66it/s]


Avg Loss/Perplexity at epoch 26: 5.981071124076843/395.8641607813351


100%|██████████| 150/150 [00:02<00:00, 62.60it/s]


Avg Loss/Perplexity at epoch 27: 5.9072136815389/367.6802525515768


100%|██████████| 150/150 [00:02<00:00, 61.79it/s]


Avg Loss/Perplexity at epoch 28: 5.88511448542277/359.643939628259


100%|██████████| 150/150 [00:02<00:00, 50.92it/s]


Avg Loss/Perplexity at epoch 29: 5.834468653996786/341.8830275997691


100%|██████████| 150/150 [00:02<00:00, 60.21it/s]


Avg Loss/Perplexity at epoch 30: 5.78940047899882/326.81703195685424


100%|██████████| 150/150 [00:02<00:00, 62.26it/s]


Avg Loss/Perplexity at epoch 31: 5.670971034367879/290.3163046746749


100%|██████████| 150/150 [00:02<00:00, 61.64it/s]


Avg Loss/Perplexity at epoch 32: 5.678224341074626/292.42971320898977


100%|██████████| 150/150 [00:02<00:00, 60.39it/s]


Avg Loss/Perplexity at epoch 33: 5.637535082499186/280.76979045073296


100%|██████████| 150/150 [00:02<00:00, 51.34it/s]


Avg Loss/Perplexity at epoch 34: 5.559417473475138/259.6715267042398


100%|██████████| 150/150 [00:02<00:00, 60.15it/s]


Avg Loss/Perplexity at epoch 35: 5.518520437876384/249.26595974777135


100%|██████████| 150/150 [00:02<00:00, 53.15it/s]


Avg Loss/Perplexity at epoch 36: 5.480155642827352/239.88404069917885


100%|██████████| 150/150 [00:02<00:00, 58.83it/s]


Avg Loss/Perplexity at epoch 37: 5.4365725406010945/229.65370424517127


100%|██████████| 150/150 [00:02<00:00, 58.47it/s]


Avg Loss/Perplexity at epoch 38: 5.350215473175049/210.6536831947922


100%|██████████| 150/150 [00:02<00:00, 52.54it/s]


Avg Loss/Perplexity at epoch 39: 5.340975322723389/208.7161766801249


100%|██████████| 150/150 [00:02<00:00, 61.69it/s]


Avg Loss/Perplexity at epoch 40: 5.27166708946228/194.74034155705814


100%|██████████| 150/150 [00:02<00:00, 60.83it/s]


Avg Loss/Perplexity at epoch 41: 5.244450642267863/189.51167692249908


100%|██████████| 150/150 [00:02<00:00, 62.41it/s]


Avg Loss/Perplexity at epoch 42: 5.136360362370809/170.09555414912657


  0%|          | 0/150 [00:00<?, ?it/s]