# 1, Import Dataset

In [1]:
# # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# # THEN FEEL FREE TO DELETE THIS CELL.
# # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# # NOTEBOOK.
# import kagglehub
# leo040802_ktvic_dataset_path = kagglehub.dataset_download('leo040802/ktvic-dataset')

# print('Data source import complete.')


In [2]:
leo040802_ktvic_dataset_path = "/kaggle/input/ktvic-dataset"

# 2, Import Libraries

In [3]:
# Import libraries
import torch
import os
import json
import itertools
import nltk
import numpy as np
import wandb
from PIL import Image
from itertools import count
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from transformers import ViTModel, ViTFeatureExtractor, GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from tqdm import tqdm
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from typing import List

2025-05-07 00:03:55.760554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746576235.943019      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746576235.998892      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# print(os.listdir(os.path.join(leo040802_ktvic_dataset_path, 'ktvic_dataset')))

# 3, Configurations

In [5]:
# Split Train & Test Set
train_img_folder = os.path.join(leo040802_ktvic_dataset_path, 'ktvic_dataset/train-images')
train_json_path = os.path.join(leo040802_ktvic_dataset_path, 'ktvic_dataset/train_data.json')
test_img_folder = os.path.join(leo040802_ktvic_dataset_path, 'ktvic_dataset/public-test-images')
test_json_path = os.path.join(leo040802_ktvic_dataset_path, 'ktvic_dataset/test_data.json')

# Set Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
# Hyperparams
BATCH_SIZE = 16
NUM_EPOCHS = 50
PROJECTION_ONLY_EPOCHS = 5
LR = 0.001
EMBED_SIZE = 512
ENCODER_MODEL = 'google/vit-base-patch16-224-in21k'
DECODER_MODEL = 'NlpHUST/gpt2-vietnamese'

In [7]:
os.makedirs("./checkpoint", exist_ok=True)
os.makedirs("./checkpoint/phase1", exist_ok=True)
os.makedirs("./checkpoint/phase2", exist_ok=True)

# 4, Vocab

In [8]:
class Vocab:
    def __init__(self, texts: List[str]):
        words = list(itertools.chain(*[text.split(" ") for text in texts]))
        counter = Counter(words)
        special_tokens = ["<bos>", "<eos>", "<pad>", "<unk>"]

        self.word2idx = {key: i for i, key in zip(count(start=4), counter.keys())}
        self.word2idx.update({key: i for i, key in enumerate(special_tokens)})
        self.idx2word = {i: key for key, i in self.word2idx.items()}
        self.max_seq_len = 256

    def __len__(self):
        return len(self.word2idx)

    def get_word2idx(self):
        return self.word2idx

    def get_idx2word(self):
        return self.idx2word

# 5, Dataset

In [9]:
class KTVICDataset(Dataset):
    def __init__(self, json_data_path, imgs_folder_path, caps_per_img=5):
        with open(json_data_path, "r") as f:
            json_data = json.load(f)
        imgs_path = sorted(json_data["images"], key=lambda x: x["filename"])
        labels_path = json_data["annotations"]
        f.close()

        self.imgs_folder_path = imgs_folder_path
        self.vocab = Vocab(texts=[item["segment_caption"] for item in labels_path])

        self.data = []
        for img_item in imgs_path:
            img_path = os.path.join(imgs_folder_path, img_item["filename"])
            img_id = img_item["id"]

            label_count = 0
            labels = []
            for label_item in labels_path:
                if label_item["image_id"] == img_id:
                    label = label_item["segment_caption"]
                    labels.append(label)
                    label_count += 1
                if label_count == caps_per_img:
                    break
            self.data.extend([(img_path, label) for label in labels])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        img = Image.open(os.path.join(self.imgs_folder_path, img_path)).convert("RGB")
        return img, label

# 6, Collator

In [10]:
class Collator:
    def __init__(self, vocab, vit_model='google/vit-base-patch16-224-in21k', tokenizer_model='NlpHUST/gpt2-vietnamese') -> None:
        self.vocab = vocab
        self.bos_id = self.vocab.get_word2idx()["<bos>"]
        self.eos_id = self.vocab.get_word2idx()["<eos>"]
        self.pad_id = self.vocab.get_word2idx()["<pad>"]
        self.vit_model = ViTFeatureExtractor.from_pretrained(vit_model)
        self.tokenizer_model = GPT2Tokenizer.from_pretrained(tokenizer_model)

        # Add special tokens
        special_tokens = {
            "pad_token": "<pad>",
            "bos_token": "<bos>",
            "eos_token": "<eos>",
            "unk_token": "<unk>"
        }
        self.tokenizer_model.add_special_tokens(special_tokens)

    def get_tokenizer_dim(self):
        return len(self.tokenizer_model)


    def tokenize_texts(self, captions):
        processed_captions = [
            self.tokenizer_model.bos_token + " " + caption + " " + self.tokenizer_model.eos_token
            for caption in captions
        ]
        encoding = self.tokenizer_model(
            processed_captions,
            padding=True,
            truncation=True,
            max_length=self.vocab.max_seq_len,
            return_tensors="pt"
        )
        return encoding["input_ids"], encoding["attention_mask"]


    def transform_img(self, images):
        inputs = self.vit_model(images=images, return_tensors="pt")
        return inputs.pixel_values.to(device)

    def __call__(self, batch):
        images = [item[0] for item in batch]
        captions = [item[1] for item in batch]
        images = self.transform_img(images)
        captions, attention_mask = self.tokenize_texts(captions)
        return {"images": images, "captions": captions, "attention_mask": attention_mask}

# 7, Model

In [11]:
class Encoder(nn.Module):
    def __init__(self, model_name="google/vit-base-patch16-224-in21k"):
        super(Encoder, self).__init__()

        # Get the pretrained VIT model
        self.model = ViTModel.from_pretrained(model_name)

        # Freeze the model
        for param in self.model.parameters():
            param.requires_grad = False

        # Get hidden dimension
        self.hid_dim = self.model.config.hidden_size

    def forward(self, images):
        with torch.no_grad():
            outputs = self.model(images)

        # Get the CLS Token
        cls_token = outputs.last_hidden_state[:, 0, :]
        return cls_token

In [12]:
class Decoder(nn.Module):
    def __init__(self, tokenizer_dim, model_name="NlpHUST/gpt2-vietnamese"):
        super(Decoder, self).__init__()

        # Setup cross attention
        config = GPT2Config.from_pretrained(model_name)
        config.add_cross_attention = True

        # Load the model & tokenizer
        self.model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

        # Resize decoder embedding dim
        self.model.resize_token_embeddings(tokenizer_dim)

        # Get decoder hidden dim
        self.hid_dim = self.model.config.n_embd

    def freeze_decoder(self):
        for param in self.model.parameters():
            param.requires_grad = False

    def unfreeze_decoder(self):
        for param in self.model.parameters():
            param.requires_grad = True

    def forward(self, encoder_output, captions, attention_mask):
        outputs = self.model(
            input_ids=captions,
            labels=captions,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_output.unsqueeze(1), # Add sequence dimension
        )
        return outputs

    def generate_caption(self, encoder_output, tokenizer, max_length=50,
                         temperature=1.0, top_k=50, top_p=0.95,
                         num_return_sequences=1):
        """
        Generate captions from encoded image features

        Args:
            encoder_output: Encoded image features
            tokenizer: GPT2Tokenizer instance
            max_length: Maximum length of generated caption
            temperature: Sampling temperature (higher = more diverse)
            top_k: Number of highest probability tokens to consider
            top_p: Cumulative probability cutoff for nucleus sampling
            num_return_sequences: Number of captions to generate
        """
        # Prepare encoder output for generation
        encoder_hidden_states = encoder_output.unsqueeze(1)

        # Start with BOS token
        input_ids = torch.tensor([[tokenizer.bos_token_id]]).to(encoder_output.device)

        # Generate captions
        output_sequences = self.model.generate(
            input_ids=input_ids,
            encoder_hidden_states=encoder_hidden_states,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Decode generated captions
        captions = []
        for sequence in output_sequences:
            # Remove special tokens and decode
            caption = tokenizer.decode(sequence, skip_special_tokens=True)
            captions.append(caption)

        return captions


In [13]:
class VITGPT2ImageCaptioning(nn.Module):
    def __init__(self, tokenizer_dim, embed_size=512, encoder_name="google/vit-base-patch16-224-in21k", decoder_name="NlpHUST/gpt2-vietnamese"):
        super(VITGPT2ImageCaptioning, self).__init__()

        # Initialize encoder & decoder
        self.encoder = Encoder(encoder_name)
        self.decoder = Decoder(tokenizer_dim, decoder_name)

        # Get hidden dims of encoder & decoder
        encoder_hid_dim = self.encoder.hid_dim
        decoder_hid_dim = self.decoder.hid_dim

        # Initialize projection layer
        self.projection = nn.Sequential(
            nn.Linear(encoder_hid_dim, embed_size),
            nn.ReLU(),
            nn.Linear(embed_size, decoder_hid_dim)
        )

    def forward(self, images, captions, attention_mask):
        x = self.encoder(images)
        x = self.projection(x)
        out = self.decoder(x, captions, attention_mask)
        return out

    def generate_caption(self, image, max_length=50, temperature=1.0,
                         top_k=50, top_p=0.95, num_return_sequences=1) -> List[str]:
        self.eval()
        with torch.no_grad():
            # Encode image
            image_features = self.encoder(image)

            # Project image features to embedding dim
            image_embedding = self.projection(image_features)

            # Generate caption using GPT-2 decoder
            captions = self.decoder.generate_caption(
                image_embedding,
                max_length=max_length,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                num_return_sequences=num_return_sequences
            )

            return captions

# 8, Training

In [14]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    torch.save(state, filename)

In [15]:
def load_checkpoint(device, file_name="my_checkpoint.pth.tar"):
    checkpoint = torch.load(file_name, map_location=device)
    return checkpoint

In [16]:
def train_epoch(model, dataloader, optimizer, device, epoch=None):
    model.train()
    total_loss = 0

    desc = f"Epoch {epoch}" if epoch else "Training"
    for batch in tqdm(dataloader, desc=desc):
        # Move data to device
        images = batch["images"].to(device)
        captions = batch["captions"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass
        outputs = model(images, captions, attention_mask)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [17]:
def evaluate(model, dataloader, device, epoch=None):
    model.eval()
    total_loss = 0

    desc = f"Epoch {epoch}" if epoch else "Evaluating"
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc):
            # Move data to device
            images = batch["images"].to(device)
            captions = batch["captions"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass
            outputs = model(images, captions, attention_mask)
            loss = outputs.loss

            total_loss += loss.item()

    return total_loss / len(dataloader)

In [18]:
# Initialize dataset
train_dataset = KTVICDataset(train_json_path, train_img_folder)

# Create validation split from training data
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# Create collator using the vocabulary from the training dataset
collator = Collator(train_dataset.dataset.vocab)
tokenizer_dim = collator.get_tokenizer_dim()

# Create dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator
)

# Initialize model
model = VITGPT2ImageCaptioning(
    tokenizer_dim=collator.get_tokenizer_dim(),
    encoder_name=ENCODER_MODEL,
    decoder_name=DECODER_MODEL,
    embed_size=EMBED_SIZE
).to(device)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/512k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at NlpHUST/gpt2-vietnamese and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'tr

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [19]:
# Login to wandb
!wandb login d51fe1c5d8de52366edde886e7d46cbce99e9f64

# Initialize the project
wandb.init(
    project="VITGPT2ImageCaptioning",
    config={
        "lr": LR,
        "batch_size": BATCH_SIZE,
        "embed_size": EMBED_SIZE,
        "encoder_model": ENCODER_MODEL,
        "decoder_model": DECODER_MODEL,
        "num_epochs": NUM_EPOCHS,
        "projection_only_epochs": PROJECTION_ONLY_EPOCHS,
    }
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Currently logged in as: [33manvu1204[0m ([33mAnVH225467[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250507_000421-7i61skow[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfiery-butterfly-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/AnVH225467/VITGPT2ImageCaptioning[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/AnVH225467/VITGPT2ImageCaptioning/runs/7i61skow[0m


In [20]:
# Phase 1: Train only the projection layer
print("===== Phase 1: Training only the projection layer =====")

# Freeze the decoder
model.decoder.freeze_decoder()

# Initialize optimizer
optimizer_phase1 = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)

# Train for PROJECTION_ONLY_EPOCHS
for epoch in range(1, PROJECTION_ONLY_EPOCHS + 1):
    # Get train loss
    train_loss = train_epoch(model, train_dataloader, optimizer_phase1, device, epoch)

    # Get val loss
    val_loss = evaluate(model, val_dataloader, device, epoch)

    # Log the loss to wandb
    wandb.log({"Train Loss (Projection Only)": train_loss, "Val Loss (Projection Only)": val_loss}, step=epoch)
    print(f"Epoch {epoch}: Train Loss = {train_loss} \| Val Loss = {val_loss}")

    # Save checkpoint
    if epoch == PROJECTION_ONLY_EPOCHS:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer_phase1.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'phase': 'projection_only'
        }
        save_checkpoint(
            state=checkpoint,
            filename=f"./checkpoint/phase1/checkpoint_epoch_{epoch}.pth.tar"
        )
        print(f"Epoch {epoch}: checkpoint saved!")

===== Phase 1: Training only the projection layer =====


Epoch 1:   0%|          | 0/1060 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|██████████| 1060/1060 [07:02<00:00,  2.51it/s]
Epoch 1: 100%|██████████| 118/118 [00:41<00:00,  2.82it/s]


Epoch 1: Train Loss = 4.3158483179110405 \| Val Loss = 3.797250309232938


Epoch 2: 100%|██████████| 1060/1060 [06:50<00:00,  2.58it/s]
Epoch 2: 100%|██████████| 118/118 [00:41<00:00,  2.83it/s]


Epoch 2: Train Loss = 3.7212965747095503 \| Val Loss = 3.6614020699161594


Epoch 3: 100%|██████████| 1060/1060 [06:51<00:00,  2.58it/s]
Epoch 3: 100%|██████████| 118/118 [00:41<00:00,  2.83it/s]


Epoch 3: Train Loss = 3.62882009519721 \| Val Loss = 3.5939569917775818


Epoch 4: 100%|██████████| 1060/1060 [06:50<00:00,  2.58it/s]
Epoch 4: 100%|██████████| 118/118 [00:41<00:00,  2.86it/s]


Epoch 4: Train Loss = 3.571370860990488 \| Val Loss = 3.555117514173863


Epoch 5: 100%|██████████| 1060/1060 [06:48<00:00,  2.60it/s]
Epoch 5: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 5: Train Loss = 3.5293865568232987 \| Val Loss = 3.521142533269979
Epoch 5: checkpoint saved!


In [21]:
# Phase 2: Train both projection layer and decoder
print("\n===== Phase 2: Training both projection layer and decoder =====")

# Unfreeze the decoder
model.decoder.unfreeze_decoder()

# Initialize optimizer
optimizer_phase2 = optim.Adam(model.parameters(), lr=LR)

# Train for the remaining epochs
for epoch in range(PROJECTION_ONLY_EPOCHS + 1, NUM_EPOCHS + 1):
    # Get train loss
    train_loss = train_epoch(model, train_dataloader, optimizer_phase2, device, epoch)

    # Get val loss
    val_loss = evaluate(model, val_dataloader, device, epoch)

    # Log the loss to wandb
    wandb.log({"Train Loss (Projection & Decoder)": train_loss, "Val Loss (Projection & Decoder)": val_loss}, step=epoch)
    print(f"Epoch {epoch}: Train Loss = {train_loss} \| Val Loss = {val_loss}")

    # Save checkpoint
    if epoch % 10 == 0:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer_phase2.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'phase': 'projection_decoder'
        }
        save_checkpoint(
            state=checkpoint,
            filename=f"./checkpoint/phase2/checkpoint_epoch_{epoch}.pth.tar"
        )
        print(f"Epoch {epoch}: checkpoint saved!")


===== Phase 2: Training both projection layer and decoder =====


Epoch 6: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 6: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 6: Train Loss = 2.227369696632871 \| Val Loss = 1.0985436884023376


Epoch 7: 100%|██████████| 1060/1060 [07:34<00:00,  2.33it/s]
Epoch 7: 100%|██████████| 118/118 [00:41<00:00,  2.83it/s]


Epoch 7: Train Loss = 0.9588752067314004 \| Val Loss = 0.9553968345714827


Epoch 8: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 8: 100%|██████████| 118/118 [00:41<00:00,  2.83it/s]


Epoch 8: Train Loss = 0.8014872874010284 \| Val Loss = 0.9307811497631719


Epoch 9: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 9: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 9: Train Loss = 0.6955124510065565 \| Val Loss = 0.9217589209645481


Epoch 10: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 10: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 10: Train Loss = 0.607483094733841 \| Val Loss = 0.9563440347121934
Epoch 10: checkpoint saved!


Epoch 11: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 11: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 11: Train Loss = 0.539876319439906 \| Val Loss = 0.9928484094344964


Epoch 12: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 12: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 12: Train Loss = 0.4864483473154734 \| Val Loss = 1.0106442313073045


Epoch 13: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 13: 100%|██████████| 118/118 [00:42<00:00,  2.81it/s]


Epoch 13: Train Loss = 0.4389141222778356 \| Val Loss = 1.0265368359573817


Epoch 14: 100%|██████████| 1060/1060 [07:33<00:00,  2.34it/s]
Epoch 14: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 14: Train Loss = 0.4015651268216799 \| Val Loss = 1.069795436273187


Epoch 15: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 15: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 15: Train Loss = 0.36923191848228565 \| Val Loss = 1.0974101313089921


Epoch 16: 100%|██████████| 1060/1060 [07:31<00:00,  2.35it/s]
Epoch 16: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 16: Train Loss = 0.3405117831140194 \| Val Loss = 1.1054821812500388


Epoch 17: 100%|██████████| 1060/1060 [07:31<00:00,  2.35it/s]
Epoch 17: 100%|██████████| 118/118 [00:41<00:00,  2.82it/s]


Epoch 17: Train Loss = 0.3209096108686249 \| Val Loss = 1.1329792247990431


Epoch 18: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 18: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 18: Train Loss = 0.2984967601046247 \| Val Loss = 1.16562648698435


Epoch 19: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 19: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 19: Train Loss = 0.2846477975682268 \| Val Loss = 1.2013426721096039


Epoch 20: 100%|██████████| 1060/1060 [07:33<00:00,  2.34it/s]
Epoch 20: 100%|██████████| 118/118 [00:41<00:00,  2.83it/s]


Epoch 20: Train Loss = 0.2685565985176923 \| Val Loss = 1.2389914903600336
Epoch 20: checkpoint saved!


Epoch 21: 100%|██████████| 1060/1060 [07:33<00:00,  2.34it/s]
Epoch 21: 100%|██████████| 118/118 [00:42<00:00,  2.81it/s]


Epoch 21: Train Loss = 0.2598341254130849 \| Val Loss = 1.260661999552937


Epoch 22: 100%|██████████| 1060/1060 [07:33<00:00,  2.34it/s]
Epoch 22: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 22: Train Loss = 0.24836922920537444 \| Val Loss = 1.2377525012371904


Epoch 23: 100%|██████████| 1060/1060 [07:32<00:00,  2.34it/s]
Epoch 23: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 23: Train Loss = 0.23839573347062434 \| Val Loss = 1.3095741984197649


Epoch 24: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 24: 100%|██████████| 118/118 [00:40<00:00,  2.88it/s]


Epoch 24: Train Loss = 0.22946532408724415 \| Val Loss = 1.3404346987352533


Epoch 25: 100%|██████████| 1060/1060 [07:26<00:00,  2.38it/s]
Epoch 25: 100%|██████████| 118/118 [00:40<00:00,  2.90it/s]


Epoch 25: Train Loss = 0.22496544386980669 \| Val Loss = 1.3056425856331648


Epoch 26: 100%|██████████| 1060/1060 [07:25<00:00,  2.38it/s]
Epoch 26: 100%|██████████| 118/118 [00:40<00:00,  2.88it/s]


Epoch 26: Train Loss = 0.21669097624297412 \| Val Loss = 1.319566240250054


Epoch 27: 100%|██████████| 1060/1060 [07:28<00:00,  2.36it/s]
Epoch 27: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 27: Train Loss = 0.21164254341625943 \| Val Loss = 1.3213186582266274


Epoch 28: 100%|██████████| 1060/1060 [07:30<00:00,  2.35it/s]
Epoch 28: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 28: Train Loss = 0.20503770862969586 \| Val Loss = 1.3635998272289664


Epoch 29: 100%|██████████| 1060/1060 [07:36<00:00,  2.32it/s]
Epoch 29: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 29: Train Loss = 0.19871687660661508 \| Val Loss = 1.3916409318729983


Epoch 30: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 30: 100%|██████████| 118/118 [00:41<00:00,  2.86it/s]


Epoch 30: Train Loss = 0.1938698662128651 \| Val Loss = 1.3712096971980596
Epoch 30: checkpoint saved!


Epoch 31: 100%|██████████| 1060/1060 [07:31<00:00,  2.35it/s]
Epoch 31: 100%|██████████| 118/118 [00:41<00:00,  2.84it/s]


Epoch 31: Train Loss = 0.1912962638474298 \| Val Loss = 1.390366791668585


Epoch 32: 100%|██████████| 1060/1060 [07:34<00:00,  2.33it/s]
Epoch 32: 100%|██████████| 118/118 [00:42<00:00,  2.81it/s]


Epoch 32: Train Loss = 0.19042415014415418 \| Val Loss = 1.405762446128716


Epoch 33: 100%|██████████| 1060/1060 [07:30<00:00,  2.35it/s]
Epoch 33: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 33: Train Loss = 0.18843779775472183 \| Val Loss = 1.4076144528591026


Epoch 34: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 34: 100%|██████████| 118/118 [00:40<00:00,  2.90it/s]


Epoch 34: Train Loss = 0.17986662215641086 \| Val Loss = 1.4568201167098547


Epoch 35: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 35: 100%|██████████| 118/118 [00:40<00:00,  2.89it/s]


Epoch 35: Train Loss = 0.1770909449612757 \| Val Loss = 1.471239988581609


Epoch 36: 100%|██████████| 1060/1060 [07:26<00:00,  2.37it/s]
Epoch 36: 100%|██████████| 118/118 [00:40<00:00,  2.88it/s]


Epoch 36: Train Loss = 0.1750187662342247 \| Val Loss = 1.5128867762573694


Epoch 37: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 37: 100%|██████████| 118/118 [00:41<00:00,  2.85it/s]


Epoch 37: Train Loss = 0.17468140694611478 \| Val Loss = 1.4899899550413682


Epoch 38: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 38: 100%|██████████| 118/118 [00:40<00:00,  2.90it/s]


Epoch 38: Train Loss = 0.17119085807001816 \| Val Loss = 1.4999893918886023


Epoch 39: 100%|██████████| 1060/1060 [07:24<00:00,  2.38it/s]
Epoch 39: 100%|██████████| 118/118 [00:40<00:00,  2.92it/s]


Epoch 39: Train Loss = 0.16890517375519815 \| Val Loss = 1.5150963401390334


Epoch 40: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 40: 100%|██████████| 118/118 [00:40<00:00,  2.89it/s]


Epoch 40: Train Loss = 0.16340504888937157 \| Val Loss = 1.5127021160166143
Epoch 40: checkpoint saved!


Epoch 41: 100%|██████████| 1060/1060 [07:26<00:00,  2.37it/s]
Epoch 41: 100%|██████████| 118/118 [00:40<00:00,  2.90it/s]


Epoch 41: Train Loss = 0.1649390966051592 \| Val Loss = 1.5737646777751082


Epoch 42: 100%|██████████| 1060/1060 [07:25<00:00,  2.38it/s]
Epoch 42: 100%|██████████| 118/118 [00:41<00:00,  2.82it/s]


Epoch 42: Train Loss = 0.1616003568934382 \| Val Loss = 1.5822713627653606


Epoch 43: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 43: 100%|██████████| 118/118 [00:41<00:00,  2.82it/s]


Epoch 43: Train Loss = 0.1639874037425473 \| Val Loss = 1.507732639373359


Epoch 44: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 44: 100%|██████████| 118/118 [00:41<00:00,  2.87it/s]


Epoch 44: Train Loss = 0.15964669403180762 \| Val Loss = 1.5411064048944894


Epoch 45: 100%|██████████| 1060/1060 [07:28<00:00,  2.36it/s]
Epoch 45: 100%|██████████| 118/118 [00:40<00:00,  2.89it/s]


Epoch 45: Train Loss = 0.15720281998082153 \| Val Loss = 1.5565635678121599


Epoch 46: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 46: 100%|██████████| 118/118 [00:41<00:00,  2.87it/s]


Epoch 46: Train Loss = 0.15421190405932236 \| Val Loss = 1.5810822369688649


Epoch 47: 100%|██████████| 1060/1060 [07:27<00:00,  2.37it/s]
Epoch 47: 100%|██████████| 118/118 [00:40<00:00,  2.89it/s]


Epoch 47: Train Loss = 0.15419077954764637 \| Val Loss = 1.6021699102248175


Epoch 48: 100%|██████████| 1060/1060 [07:25<00:00,  2.38it/s]
Epoch 48: 100%|██████████| 118/118 [00:40<00:00,  2.90it/s]


Epoch 48: Train Loss = 0.15550415314874558 \| Val Loss = 1.587506284148006


Epoch 49: 100%|██████████| 1060/1060 [07:29<00:00,  2.36it/s]
Epoch 49: 100%|██████████| 118/118 [00:40<00:00,  2.89it/s]


Epoch 49: Train Loss = 0.15067815768690604 \| Val Loss = 1.6278363643056255


Epoch 50: 100%|██████████| 1060/1060 [07:30<00:00,  2.35it/s]
Epoch 50: 100%|██████████| 118/118 [00:41<00:00,  2.81it/s]


Epoch 50: Train Loss = 0.1472419727101641 \| Val Loss = 1.6284550893104683
Epoch 50: checkpoint saved!
