## Installing Modules

In [1]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install python-multipart
!pip install uvicorn
!pip install kaleido
!pip install fastapi
!pip install hazm

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_

In [1]:
!git clone https://github.com/AUT-NLP/PQuAD

Cloning into 'PQuAD'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 27 (delta 9), reused 15 (delta 3), pack-reused 0[K
Receiving objects: 100% (27/27), 5.71 MiB | 2.73 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import time
import collections
import matplotlib.pyplot as plt
from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence
import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM , XLMRobertaForSequenceClassification , AutoModelForQuestionAnswering
from transformers import get_linear_schedule_with_warmup, AdamW
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm_notebook
from datasets import load_dataset
import warnings
import json
from datasets import Dataset, DatasetDict
import hazm

In [3]:
hazm_normalizer = hazm.Normalizer()

## Creating dataframe for the dataset

In [4]:
def create_df(add):
  f = open(add)
  df_data = pd.DataFrame(columns= ["id","title" ,"context" , "question" ,"answers" ])
  data = json.load(f)
  for obj in tqdm_notebook(data["data"] , total = len(data["data"])):
    title = obj["title"]
    parags = obj["paragraphs"]
    for parag in parags:
      context =  parag["context"]
      qas = parag["qas"]
      for ques in qas:
        question = ques["question"]
        id = ques["id"]
        answers = ques["answers"]
        answer_list =[]
        answer_text_list = []
        if len(answers)>0:
          for ao in answers:
            answer_list.append(ao["answer_start"])
            answer_text_list.append(ao["text"])
        ans = {'text': answer_text_list, 'answer_start': answer_list}
        df_data = df_data.append({"id":id,"title":title,"context":context,"question":question,"answers":ans},ignore_index=True)
  return df_data


In [5]:
warnings.filterwarnings("ignore", category=FutureWarning)
train_df = create_df("/content/PQuAD/Dataset/Train.json")
valid_df = create_df("/content/PQuAD/Dataset/Validation.json")
test_df = create_df("/content/PQuAD/Dataset/Test.json")
ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
ds['validation'] = Dataset.from_pandas(valid_df)
ds["test"] = Dataset.from_pandas(test_df)

  0%|          | 0/891 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

In [6]:
def plot(loss_list, metric_list, title):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
    fig.subplots_adjust(wspace=.2)
    plotLoss(ax1, loss_list, title)
    plotAccuracy(ax2, metric_list, title)
    plt.show()


def plotLoss(ax, loss_list, title):
    ax.plot(loss_list[:, 0], label="Train_loss")
    ax.plot(loss_list[:, 1], label="Validation_loss")
    ax.set_title("Loss Curves - " + title, fontsize=12)
    ax.set_ylabel("Loss", fontsize=10)
    ax.set_xlabel("Epoch", fontsize=10)
    ax.legend(prop={'size': 10})


def plotAccuracy(ax, metric_list, title):
    ax.plot(metric_list[:, 0], label="Exact")
    ax.plot(metric_list[:, 1], label="F1")
    ax.set_title("Metrics Curves - " + title, fontsize=12)
    ax.set_ylabel("Score", fontsize=10)
    ax.set_xlabel("Epoch", fontsize=10)
    ax.legend(prop={'size': 10})

## Tokenizer Class

In [7]:
class DatasetTokenizer:

    STRIDE = 128
    LOADING_BATCH_SIZE = 1000


    def __init__(self , tokenizer , max_len , lbs = LOADING_BATCH_SIZE , stride = STRIDE  ) :
        self.tokenizer = tokenizer
        self.lbs = lbs
        self.stride = stride
        self.max_len = max_len


    def tokenize(self , dataset ) :
        for i in tqdm ( range ( 0 , dataset.shape[0] , self.lbs) ):
            mini_data = dataset[i : i + self.lbs]
            tokenized_mini_data = self.batch_tokenizer ( mini_data  , self.tokenizer  ,  self.max_len , self.stride )

            if i == 0 : encoded_inputs = tokenized_mini_data
            else :
                for key in encoded_inputs :
                    encoded_inputs[key] += tokenized_mini_data[key]
        return encoded_inputs



    def batch_tokenizer(self, inputs , tokenizer  , max_len , stride   ):

        question = inputs ['question']
        context = inputs ['context']

        # When the input length is longer than the max_length, tokenizer splits input and generate multiple features (tokenized input)
        # These features have overlap together and the stride hyperparameter determines the size of this overlap
        # features generated from an input are called span
        encoded_inputs = tokenizer(
            question,
            context,
            truncation = 'only_second',
            max_length = max_len,
            padding="max_length",
            stride = stride ,
            return_overflowing_tokens = True,
            return_offsets_mapping = True,
          )

        encoded_inputs["ans_start_pos"] = []
        encoded_inputs["ans_end_pos"] = []
        encoded_inputs["sample_id"] = []

        # map from token to character position in the original context
        # If the input is split, each span will have an offset list
        # -> for ex: the offsets list of encoded_input[0] = offset_mapping[0]
        offset_mapping = encoded_inputs["offset_mapping"]
        # Map from encoded input to its corresponding input (data sample)
        sample_mapping = encoded_inputs.pop("overflow_to_sample_mapping")

        for i, offsets in enumerate(offset_mapping):

            input_ids = encoded_inputs["input_ids"][i]
            # Index of cls_token
            cls_index = input_ids.index(tokenizer.cls_token_id)
            # sequence_ids distinguish which part of the offsets is for the question and which part is for the context
            sequence_ids = encoded_inputs.sequence_ids(i)

            context_id = 1

            sample_index = sample_mapping[i]
            encoded_inputs["sample_id"].append(inputs["id"][sample_index])
            answers = inputs["answers"][sample_index]

            # if the question has no answer, set the cls_index as the answer start and end position
            if len(answers["answer_start"]) == 0:
                encoded_inputs["ans_start_pos"].append(cls_index)
                encoded_inputs["ans_end_pos"].append(cls_index)
            else:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

            # finds index of the first token belonging to the context bt iterating through the sequece ids
                context_first_token_idx = 0
                while sequence_ids[context_first_token_idx] != 1 :
                    context_first_token_idx += 1

            # finds index of the last token belonging to the context bt iterating through the sequece ids
                context_last_token_idx = len(input_ids) - 1
                while sequence_ids[context_last_token_idx] != 1:
                    context_last_token_idx -= 1

            # Detect if the answer exists in the span (encoded_inputs)
                if (offsets[context_first_token_idx][0] <= start_char and offsets[context_last_token_idx][1] >= end_char):
                    answer_start_index = context_first_token_idx

                    # finds index of the token asnwer starts in the context
                    while answer_start_index < len(offsets) and offsets[answer_start_index][0] <= start_char:
                        answer_start_index += 1
                    encoded_inputs["ans_start_pos"].append(answer_start_index - 1)

                    answer_end_index = context_last_token_idx
                    while offsets[answer_end_index][1] >= end_char:
                        answer_end_index -= 1
                    encoded_inputs["ans_end_pos"].append(answer_end_index + 1)

                else:
                    encoded_inputs["ans_start_pos"].append(cls_index)
                    encoded_inputs["ans_end_pos"].append(cls_index)

                # put (0,0) as an offset in the offset mapping list, if the corresponding word does not belong to the context
                encoded_inputs["offset_mapping"][i] = [
                      (offset if sequence_ids[j] == context_id else (0,0))
                      for j, offset in enumerate(encoded_inputs["offset_mapping"][i])]



        return encoded_inputs

## Data Flow

In [8]:
class Data(Dataset):

    def __init__(self, dataset , tokenizer , max_len ):
        self.dataset=dataset
        dt = DatasetTokenizer(tokenizer, max_len  )
        self.encoded_inputs = dt.tokenize ( dataset )

    def __getitem__(self,item):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        ans_start_pos = []
        ans_end_pos = []
        sample_id = []
        offset_mapping= []

        for i in item:
          input_ids.append( torch.tensor(self.encoded_inputs['input_ids'][i] ))
          attention_mask.append(torch.tensor(self.encoded_inputs['attention_mask'][i] ))
          ans_start_pos.append(self.encoded_inputs['ans_start_pos'][i])
          ans_end_pos.append(self.encoded_inputs['ans_end_pos'][i])
          sample_id.append(self.encoded_inputs['sample_id'][i])
          offset_mapping.append(self.encoded_inputs['offset_mapping'][i])
          token_type_ids.append(torch.tensor( self.encoded_inputs['token_type_ids'][i] ))


        return {

            'input_ids' : input_ids,
            'attention_mask' : attention_mask,
            'ans_start_pos' : ans_start_pos,
            'ans_end_pos' : ans_end_pos,
            'sample_id' : sample_id,
            'offset_mapping' : offset_mapping,
            'token_type_ids' : token_type_ids
        }

    def __len__(self):
        return len( self.encoded_inputs['input_ids'] )

In [9]:
def getDataLoaders( tokenizer , max_len):



    train_set = ds['train']
    val_set = ds['validation']
    test_set = ds["test"]
    print('train samples = ', train_set.shape[0] , f'(batch-size = {train_bs}')
    print('validation samples = ', val_set.shape[0], f'(batch-size = {test_val_bs}')

    print('-'*40,'\nStart tokenizing trian set...')
    train_dl = torch.utils.data.DataLoader(  Data( train_set , tokenizer , max_len ) , batch_size = train_bs , shuffle = True  )

    print('-'*40,'\nStart tokenizing validation set...')
    val_dl = torch.utils.data.DataLoader(  Data( val_set , tokenizer , max_len  ) , batch_size = test_val_bs , shuffle = True  )

    print('-'*40,'\nStart tokenizing test set...')
    test_dl = torch.utils.data.DataLoader(  Data( test_set , tokenizer , max_len  ) , batch_size = test_val_bs , shuffle = True  )

    return train_dl , val_dl , val_set  ,  test_dl , test_set

## Model Architecture

In [10]:
class BaseLineNetwork(nn.Module):

    def __init__(self , model_type):
        super(BaseLineNetwork, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_type)
        self.drop = nn.Dropout(p = 0.2)
        self.fc = nn.Linear(self.transformer.config.hidden_size, 2)

    def forward(self, input_ids  ,token_type_ids , attention_mask):

        output = self.transformer(input_ids = input_ids  ,token_type_ids = token_type_ids, attention_mask = attention_mask )
        output = self.fc(output[0]) # output is tupple which its first element is hidden state of output embedding

        ans_start_pos, ans_end_pos = output.split(1, dim=-1)
        ans_start_pos = ans_start_pos.squeeze(-1)
        ans_end_pos = ans_end_pos.squeeze(-1)

        return ans_start_pos, ans_end_pos

In [11]:
def getModel(model_type):
    model = BaseLineNetwork(model_type)
    model = model.to(device)
    return model

In [12]:
def loss_function( start_true , end_true , start_pred, end_pred):
    # Ignore start/end positions if are outside of model inputs
    # clamp() method clamps all the input elements into the range [ min, max ]
    max_input_index = start_pred.size(1)
    start_true.clamp_(0, max_input_index)
    end_true.clamp_(0, max_input_index)

    ce = torch.nn.CrossEntropyLoss(ignore_index=max_input_index)
    start_loss = ce(start_pred, start_true)
    end_loss = ce(end_pred, end_true)
    return (start_loss + end_loss) / 2

## Data Postprocessing

In [13]:
def collect_predictions(batch , start_logits ,end_logits  , predictions ) :

    offset_mapping =  batch['offset_mapping']
    sample_ids = batch['sample_id']
    input_ids = batch['input_ids']
    offset_mapping0 = np.array([[t.numpy().tolist() for t in subtle][0] for subtle in offset_mapping ]) # start postion
    offset_mapping0 = offset_mapping0.T
    offset_mapping1 = np.array([[t.numpy().tolist() for t in subtle][1] for subtle in offset_mapping ]) # end position
    offset_mapping1 = offset_mapping1.T

    for i in range (len(batch['input_ids'])):

        predictions.append(
            {
            'sample_id' : sample_ids[i],
            'offset_mapping' :[offset_mapping0[i],offset_mapping1[i]],
            'input_ids' : input_ids[i],
            'end_logits' : end_logits[i],
            'start_logits' : start_logits[i] ,
        })

    return predictions

In [14]:
def postprocess_predictions(predictions , inputs , tokenizer ,  n_best_size = 20, max_answer_length = 20):

    # Map example to its corresponding features.
    sample_idToIndex = {id : index for index , id in enumerate(inputs["id"])}
    # Specifying the predictions made based on each data sample (since each sample can generate several encoded-inputs);
    pred_per_samples = collections.defaultdict(list)
    for i, pred in enumerate(predictions):
      pidx = pred["sample_id"]
      sit = sample_idToIndex[pidx]
      pred_per_samples[sit].append(i)

    final_predictions = collections.OrderedDict()

    for sample_index, sample in enumerate(inputs):

        preds_indices = pred_per_samples[sample_index]
        min_null_score = None
        valid_answers = []
        context = sample["context"]

        for preds_index in preds_indices:

            start_logits = predictions[preds_index]['start_logits']
            end_logits = predictions[preds_index]['end_logits']
            offset_mapping = predictions[preds_index]["offset_mapping"]
            cls_index = predictions[preds_index]["input_ids"].numpy().tolist().index(tokenizer.cls_token_id)
            pred_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < pred_null_score:
                min_null_score = pred_null_score

            # List of top "n_best_size" positions for the start and end position of the answer
            best_start_indexes = np.flip( np.argsort( start_logits ) ).tolist()[: n_best_size]
            best_end_indexes = np.flip( np.argsort( end_logits ) ).tolist()[: n_best_size]

            # Loop to obtain a sorted list of best answers based on the scores (summation of start and end position scores)
            for start_index in best_start_indexes:
                for end_index in best_end_indexes:
                    # If answer is valid
                    if ( not ( start_index >= len(offset_mapping) ) or not ( end_index >= len(offset_mapping) ) or
                         not ( end_index < start_index ) or not ( end_index - start_index + 1 > max_answer_length ) or
                         not (torch.all( offset_mapping[start_index] is torch.Tensor([0,0]) )) or not (torch.all( offset_mapping[end_index] is torch.Tensor([0,0]) )) or offset_mapping[end_index]):

                        start_char = offset_mapping[0][start_index]
                        end_char = offset_mapping[1][end_index]


                        valid_answer = {"score": start_logits[start_index] + end_logits[end_index], "text": context[start_char: end_char]}
                        valid_answers.append( valid_answer )

        if len(valid_answers) > 0: best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        # If model can't to predict any valid answer, create a fake prediction to avoid failure
        else: best_answer = {"text": "", "score": 0.0}

        answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
        final_predictions[sample["id"]] = answer

    return final_predictions

In [15]:
def compute_metrics ( predictions , raw_input , tokenizer) :

    # Process predictions to obtain the text of the predicted answer for each data sample
    final_predictions = postprocess_predictions(predictions , raw_input , tokenizer)
    # Reformat data (need this type of data to compute metrics)
    formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
    ground_trouth = [{"id": sample["id"], "answers": sample["answers"]} for sample in raw_input]

    metric = load_metric("squad_v2" )
    return metric.compute(predictions=formatted_predictions, references = ground_trouth)

## Training Model Class

In [16]:
def train(model , dataloader , optimizer , criterion  , epoch ) :

    #put the model on train mode
    model.train()
    losses =  []
    print(len(dataloader))
    for iteration , batch in enumerate(tqdm_notebook(dataloader)) :

        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        answer_sp = batch['ans_start_pos'].to(device)
        answer_ep = batch['ans_end_pos'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        start , end  = model(input_ids = input_ids ,token_type_ids=token_type_ids, attention_mask = attention_mask  )

        loss = criterion(answer_sp, answer_ep , start , end)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

        if iteration % round((len(dataloader) / 5)) == 0:
            print( f'[Epoch][Batch] = [{epoch+1}][{iteration}] -> Loss = {np.mean(losses):.4f}')

    return  np.mean(losses)



def evaluate(model , dataloader , criterion , raw_input , tokenizer ) :

    #put the model on evaluation mode
    model.eval()
    losses , predictions = [],[]

    for batch in dataloader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        answer_sp = batch['ans_start_pos'].to(device)
        answer_ep = batch['ans_end_pos'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        with torch.no_grad():
          start_logits , end_logits  = model(input_ids = input_ids ,token_type_ids=token_type_ids, attention_mask = attention_mask  )
        predictions = collect_predictions(batch  , start_logits.cpu().detach().numpy() ,
                                          end_logits.cpu().detach().numpy() , predictions )

        loss = criterion(answer_sp, answer_ep , start_logits , end_logits)
        losses.append(loss.item())

    metrics = compute_metrics(predictions , raw_input , tokenizer)
    return  np.mean(losses) , metrics['exact'] , metrics['f1']


In [17]:
def trainModel (_model , max_length  , lr , weight_decay ,epochs, sch_gamma , sch_step ,title,model) :

    tokenizer  =  AutoTokenizer.from_pretrained(_model)
    train_dataloader , val_dataloader , val_set , test_dataloader , test_set  = getDataLoaders( tokenizer , max_length )

    print('-'*40)
    print('Number of train batches =',len(train_dataloader))
    print('Number of validaion batches =',len(val_dataloader))
    #print('# of test batches =',len(test_dataloader)  )
    print('-'*40)



    #Determine the type of : optimizer, scheduling and loss
    optimizer  = AdamW(model.parameters(), lr=5e-5,weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = sch_step, gamma = sch_gamma)
    criterion = loss_function

    best_accuracy = 0
    loss_list , metric_list =[],[]

    print('-' * 40,'\nStart Training ....\n')
    for epock in range(epochs):

        train_loss = train( model , train_dataloader , optimizer , criterion , epock )
        val_loss , exact_score , f1_score = evaluate( model , val_dataloader , criterion , val_set , tokenizer )
        scheduler.step()

        loss_list.append([train_loss , val_loss])
        metric_list.append([exact_score , f1_score])

        print(f'\tTrain -> Loss = {train_loss:.4f}')
        print(f'\tValidation -> Loss = {val_loss:.4f} / Exact_score = {exact_score:.3f} - F1_score = {f1_score:.3f}', '\n')


    val_loss , exact_score , f1_score = evaluate( model , test_dataloader , criterion , test_set , tokenizer )
    print(f'\tTest -> Loss = {val_loss:.4f} / Exact_score = {exact_score:.3f} - F1_score = {f1_score:.3f}', '\n')


    plot(np.array(loss_list),np.array(metric_list),title)

In [None]:
epochs = 3
learning_rate = 5e-5
train_bs = 16
test_val_bs = 4
max_length = 512
sch_gamma = 0.1
weight_decay=0.1
sch_step = 5
max_answer_length = 20
bert_model = "HooshvareLab/bert-base-parsbert-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device,'is available')
model = getModel(bert_model)
trainModel(bert_model , 512 ,  learning_rate, weight_decay , epochs , sch_gamma , sch_step, ' dataset name ',model)

cuda is available


config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

train samples =  63994 (batch-size = 16
validation samples =  7976 (batch-size = 4
---------------------------------------- 
Start tokenizing trian set...


100%|██████████| 64/64 [00:58<00:00,  1.10it/s]


---------------------------------------- 
Start tokenizing validation set...


100%|██████████| 8/8 [00:06<00:00,  1.19it/s]


---------------------------------------- 
Start tokenizing test set...


100%|██████████| 9/9 [00:09<00:00,  1.02s/it]


----------------------------------------
Number of train batches = 4000
Number of validaion batches = 1994
----------------------------------------
---------------------------------------- 
Start Training ....

4000


  0%|          | 0/4000 [00:00<?, ?it/s]

[Epoch][Batch] = [1][0] -> Loss = 6.2580
[Epoch][Batch] = [1][800] -> Loss = 1.3063
[Epoch][Batch] = [1][1600] -> Loss = 1.1325
[Epoch][Batch] = [1][2400] -> Loss = 1.0558
[Epoch][Batch] = [1][3200] -> Loss = 1.0075


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

	Train -> Loss = 0.9739
	Validation -> Loss = 0.8367 / Exact_score = 68.932 - F1_score = 82.819 

4000


  0%|          | 0/4000 [00:00<?, ?it/s]

[Epoch][Batch] = [2][0] -> Loss = 0.4943
[Epoch][Batch] = [2][800] -> Loss = 0.5592
[Epoch][Batch] = [2][1600] -> Loss = 0.5642
[Epoch][Batch] = [2][2400] -> Loss = 0.5776
[Epoch][Batch] = [2][3200] -> Loss = 0.5863
	Train -> Loss = 0.5951
	Validation -> Loss = 0.8820 / Exact_score = 69.145 - F1_score = 82.718 

4000


  0%|          | 0/4000 [00:00<?, ?it/s]

[Epoch][Batch] = [3][0] -> Loss = 0.2169
[Epoch][Batch] = [3][800] -> Loss = 0.3836
[Epoch][Batch] = [3][1600] -> Loss = 0.3970
