# Imports

In [None]:
import torch
from torch import optim
from torchvision import transforms
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import sys

In [None]:
# Adiciona o diretório pai (raiz do projeto) ao path do sistema
sys.path.append(os.path.abspath(".."))

# ativar autoreload -> capturar mudanças do código fonte
%load_ext autoreload
%autoreload 2

from src.dataset import Vocabulary, FlickrDataset, build_glove_matrix, preprocess_data
from src.trainer import ImageCaptionTrainer
from src.model import ImageCaptionModel, ScratchGRU, PreTrainedMobileNetV3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


NameError: name 'Vocabulary' is not defined

# Preprocess Data

## Data Splitting

In [None]:
# Load the cleaned caption table.
df = pd.read_csv("../data/flickr8k_cleaned_data.csv")

# Two-stage split sharing one seed:
#   stage 1 -> 70% train, 30% held out
#   stage 2 -> the held-out part is halved into validation and test
# NOTE(review): the split is row-wise. If the CSV holds several caption rows
# per image, the same image can end up in more than one split -- confirm, and
# consider a group-wise split keyed on the image identifier.
split_seed = 42
train_df, temp_df = train_test_split(df, random_state=split_seed, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=split_seed, test_size=0.5)

In [None]:
# Minimum frequency handed to the Vocabulary constructor.
min_freq = 1
vocab = Vocabulary(min_freq)

# The vocabulary is built from the training captions only, so validation and
# test captions do not contribute to it.
train_captions = train_df["caption_clean"].tolist()
vocab.build_vocabulary(train_captions)

In [None]:
# Per-channel mean and standard deviation of ImageNet, used for normalisation.
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std = (0.229, 0.224, 0.225)

# Image pipeline shared by all three splits: resize to 224x224, convert to a
# tensor, then normalise each colour channel.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std),
    ]
)

batch_size = 4
max_tokens = 50
image_path = "../data/raw/Images"

# One dataset per split, built in train -> val -> test order with identical
# settings; only the caption frame differs.
train_dataset, val_dataset, test_dataset = [
    FlickrDataset(image_path, split_df, vocab, transform=transform, max_tokens=max_tokens)
    for split_df in (train_df, val_df, test_df)
]

# Wrap the three datasets into their loaders.
train_loader, val_loader, test_loader = preprocess_data(
    train_dataset,
    val_dataset,
    test_dataset,
    batch_size,
)

# Model Setup

In [None]:
# Model configuration -> still subject to change.

# Seed the global PyTorch and NumPy RNGs before the model is built so that
# runs are repeatable as far as these generators are concerned. The value
# matches the random_state used for the data split.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

EMBED_SIZE = 100      # passed as embed_size to both the encoder and the decoder
HIDDEN_SIZE = 100     # passed as hidden_size to the GRU decoder
NUM_LAYERS = 2        # passed as num_layers to the GRU decoder
DROPOUT = 0.5         # passed as dropout_rate to both the encoder and the decoder
EPOCHS = 20           # passed as epochs to trainer.fit
PATIENCE = 5          # passed as patience to trainer.fit (early stopping)
LEARNING_RATE = 3e-4  # learning rate of the Adam optimizer

In [None]:
# Image encoder (MobileNetV3). It is built first so that the construction
# order of the two sub-modules stays encoder -> decoder.
# NOTE(review): fine_tune=True presumably leaves the backbone trainable --
# confirm in src.model.
encoder = PreTrainedMobileNetV3(embed_size=EMBED_SIZE, dropout_rate=DROPOUT, fine_tune=True)

# Caption decoder (GRU), given the vocabulary and the same embedding size as
# the encoder.
decoder = ScratchGRU(
    vocab=vocab,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT,
)

# Full captioning model combining the encoder and the decoder.
model = ImageCaptionModel(rnn=decoder, cnn=encoder)

NameError: name 'PreTrainedMobileNetV3' is not defined

## Training

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Adam over every model parameter, with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# The trainer receives the model, its optimizer and the target device.
trainer = ImageCaptionTrainer(device=device, optimizer=optimizer, model=model)

In [None]:
# Run training with early stopping.
# NOTE(review): epsilon is presumably the minimum improvement that counts as
# progress for early stopping -- confirm in src.trainer.
early_stopping_options = {"patience": PATIENCE, "epsilon": 1e-3}

trainer.fit(
    epochs=EPOCHS,
    train_loader=train_loader,
    val_loader=val_loader,
    **early_stopping_options,
)