In [None]:
import os
import pickle
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

from transformers import AutoTokenizer
# Shared tokenizer loaded from the "bert-base-uncased" checkpoint; `tokenize_text`
# reads it as a global and it is also handed to the dataset further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Relative path of the notebook's working directory.
# NOTE(review): presumably this is where the images live — confirm.
WORKING_DIR = 'working'

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Build torchvision's VGG16 and request its pretrained weights.
# NOTE(review): recent torchvision releases appear to prefer a `weights=` argument
# over `pretrained=` — confirm against the installed version.
vgg16 = models.vgg16(pretrained=True)

In [None]:
# Keep every top-level child module of VGG16 except the last one and chain them
# into a Sequential that is used as the image feature extractor.
# NOTE(review): presumably the dropped child is the classifier head — confirm
# with the printed structure below.
vgg16_features = nn.Sequential(*list(vgg16.children())[:-1])

In [None]:
# Move the feature extractor onto the device selected above.
vgg16_features = vgg16_features.to(device)

In [None]:
# Show the layers that were kept in the feature extractor.
print(vgg16_features)

In [None]:
def preprocess_image(image_path):
    """Load an image file and turn it into a normalized, batched tensor.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The transformed image with a leading batch axis of size 1, placed on
        the module-level ``device``.
    """
    pipeline = transforms.Compose([
        transforms.Resize((224, 224)),  # fixed 224x224 input size
        transforms.ToTensor(),          # PIL image -> tensor
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    rgb_image = Image.open(image_path).convert('RGB')  # force three channels
    single_image = pipeline(rgb_image)
    batched_image = single_image.unsqueeze(0)  # add the batch axis
    return batched_image.to(device)

In [None]:
def tokenize_text(captions):
    """Encode a batch of caption strings into a padded tensor of token ids.

    Args:
        captions: A ``list`` whose items are all ``str``.

    Returns:
        The ``input_ids`` tensor produced by the module-level ``tokenizer``
        (padded to a common length, truncated when too long), moved to the
        module-level ``device``.

    Raises:
        ValueError: If ``captions`` is not a list or holds a non-string item.
    """
    is_list_of_str = isinstance(captions, list) and all(
        isinstance(item, str) for item in captions
    )
    if not is_list_of_str:
        raise ValueError("Input captions must be a list of strings.")

    batch = tokenizer(
        captions,
        padding=True,         # pad every caption to the same length
        truncation=True,      # cut captions that exceed the model limit
        return_tensors="pt",  # ask for PyTorch tensors
    )

    # Only the token ids are needed by the callers.
    input_ids = batch["input_ids"]
    return input_ids.to(device)

In [None]:
def process_sequences(tokenized_captions, vocab, max_length):
    """Map token sequences to vocabulary indices and pad them into one tensor.

    Tokens that are not present in ``vocab`` are dropped.

    Args:
        tokenized_captions: Iterable of captions, each an iterable of tokens.
        vocab: Mapping from token to integer index; must contain ``'<pad>'``.
        max_length: Width of the returned tensor. Longer captions are
            truncated and shorter ones are right-padded with the ``'<pad>'``
            index. ``None`` pads to the longest caption in the batch.

    Returns:
        A ``torch.long`` tensor of shape ``(num_captions, max_length)`` on the
        module-level ``device``.

    Raises:
        KeyError: If ``vocab`` has no ``'<pad>'`` entry.
        ValueError: If ``max_length`` is negative.
    """
    if max_length is not None and max_length < 0:
        raise ValueError("max_length must be non-negative.")
    pad_index = vocab['<pad>']

    # Convert tokens to indices, honouring the requested maximum length.
    sequences = []
    for caption in tokenized_captions:
        indices = [vocab[token] for token in caption if token in vocab]
        if max_length is not None:
            indices = indices[:max_length]
        sequences.append(indices)

    if max_length is not None:
        width = max_length
    else:
        width = max((len(indices) for indices in sequences), default=0)

    # Start from an all-padding matrix with an explicit integer dtype so that
    # empty captions (or an empty batch) cannot produce a float tensor.
    padded_sequences = torch.full((len(sequences), width), pad_index, dtype=torch.long)
    for row, indices in enumerate(sequences):
        if indices:
            padded_sequences[row, :len(indices)] = torch.tensor(indices, dtype=torch.long)
    return padded_sequences.to(device)

In [None]:
# Two sample captions used to try out the text helpers defined above.
captions = ["The cat sits on the mat.", "A dog barks loudly."]

In [None]:
# Encode the sample captions with `tokenize_text` and show the resulting
# tensor of token ids.
tokenized_captions = tokenize_text(captions)
print("Tokenized Captions:", tokenized_captions)

In [None]:
# `process_sequences` expects string tokens plus a token -> index mapping that
# has a '<pad>' entry. Neither existed at this point (no `vocab` was defined and
# `tokenized_captions` is a tensor of ids), so derive both from the tokenizer.
vocab = dict(tokenizer.get_vocab())
vocab.setdefault('<pad>', tokenizer.pad_token_id)  # alias for the tokenizer's own pad token
caption_tokens = [tokenizer.tokenize(caption) for caption in captions]
padded_sequences = process_sequences(caption_tokens, vocab, max_length=10)

In [None]:
# Show the index tensor returned by `process_sequences`.
print("Padded sequences:", padded_sequences)

In [None]:
class CaptionGenerator(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first time step, followed by
    the embedded caption tokens; every LSTM output is mapped to vocabulary
    logits.

    Args:
        vocab_size: Number of token ids the model can embed and predict.
        embed_dim: Width of the token embeddings (and of the LSTM input).
        hidden_dim: Width of the LSTM hidden state.
        feature_dim: Width of the flattened image features that need to be
            projected to ``embed_dim``. Defaults to ``512 * 7 * 7``, which is
            assumed to be the flattened output of the VGG16 extractor used in
            this notebook. Pass ``None`` (or ``embed_dim``) to create no
            projection layer; features must then already be ``embed_dim`` wide.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, feature_dim=512 * 7 * 7):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Image features and token embeddings are concatenated along the time
        # axis, so they must share the same width; project when they do not.
        if feature_dim is not None and feature_dim != embed_dim:
            self.feature_proj = nn.Linear(feature_dim, embed_dim)
        else:
            self.feature_proj = None
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, features, captions):
        """Compute vocabulary logits for every input step.

        Args:
            features: Image features for the batch. Any shape is accepted as
                long as it flattens to ``(batch, embed_dim)`` or
                ``(batch, feature_dim)``; the batch size is taken from
                ``captions``.
            captions: Integer tensor of token ids, shape ``(batch, steps)``.

        Returns:
            Tensor of shape ``(batch, steps + 1, vocab_size)``; the extra
            leading step is the one driven by the image feature.

        Raises:
            ValueError: If the flattened feature width matches neither
                ``embed_dim`` nor the configured ``feature_dim``.
        """
        batch_size = captions.size(0)
        # One row per caption, whatever spatial layout the extractor produced.
        features = features.reshape(batch_size, -1)
        width = features.size(1)
        if width != self.embed_dim:
            if self.feature_proj is None or width != self.feature_proj.in_features:
                raise ValueError(
                    f"Expected image features of width {self.embed_dim}"
                    + (f" or {self.feature_proj.in_features}" if self.feature_proj is not None else "")
                    + f" per sample, got {width}."
                )
            features = self.feature_proj(features)

        embedded = self.embedding(captions)
        inputs = torch.cat((features.unsqueeze(1), embedded), dim=1)
        hiddens, _ = self.lstm(inputs)
        outputs = self.fc(hiddens)
        return outputs

In [None]:
# Initialize caption generator.
# Captions are encoded with the BERT `tokenizer`, so the decoder's vocabulary
# has to cover every id that tokenizer can emit; `len(tokenizer)` gives that
# size without relying on a separately defined `vocab`.
vocab_size = len(tokenizer)
embed_dim = 256
hidden_dim = 512
caption_model = CaptionGenerator(vocab_size, embed_dim, hidden_dim).to(device)

In [None]:
# Show the layers of the caption decoder.
print("Caption Generator Model:", caption_model)

In [None]:
def generate_caption(image_path):
    """Run the encoder/decoder once on an image with a placeholder caption.

    This is still a smoke test rather than a real decoder: the caption fed to
    the model is a hard-coded dummy sequence.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The logits produced by ``caption_model`` for the dummy caption (also
        printed).
    """
    # Preprocess image
    image_tensor = preprocess_image(image_path)

    # Dummy captions for testing (replace with actual captions for real use)
    captions = torch.tensor([[1, 3, 0, 0, 0]]).to(device)  # Example input sequence

    # Inference only: no autograd bookkeeping for either network.
    with torch.no_grad():
        # Extract features using VGG16 and flatten them per image. The batch
        # axis is kept on purpose: a bare squeeze() would remove it for a
        # single image.
        # NOTE(review): the flattened width must be one `caption_model` accepts.
        features = vgg16_features(image_tensor)
        features = features.reshape(features.size(0), -1)

        # Generate caption
        outputs = caption_model(features, captions)

    print("Generated output:", outputs)
    return outputs


In [None]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms

class ImageCaptionDataset(Dataset):
    """Dataset of (image, tokenized caption) pairs read from a captions file.

    The captions file holds one pair per line as ``<image name>\\t<caption>``.
    Blank lines are ignored and only the first tab separates the two fields,
    so a caption may itself contain tabs.
    """

    def __init__(self, image_dir, captions_file, tokenizer, transform=None, max_length=50):
        """
        Args:
            image_dir (str): Directory with all the images.
            captions_file (str): Path to the file containing image-caption pairs.
            tokenizer (object): Tokenizer to process the captions.
            transform (callable, optional): Transformations for images.
            max_length (int): Maximum length for tokenized captions.

        Raises:
            ValueError: If a non-blank line has no tab-separated caption.
        """
        self.image_dir = image_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load image-caption pairs
        self.image_caption_pairs = self._read_pairs(captions_file)

    @staticmethod
    def _read_pairs(captions_file):
        """Parse the captions file into a list of ``[image_name, caption]``."""
        pairs = []
        with open(captions_file, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                record = line.strip()
                if not record:
                    continue  # tolerate blank / trailing lines
                fields = record.split("\t", 1)
                if len(fields) != 2:
                    # Fail at load time instead of deep inside a DataLoader worker.
                    raise ValueError(
                        f"{captions_file}:{line_number}: expected '<image>\\t<caption>', got {record!r}"
                    )
                pairs.append(fields)
        return pairs

    def __len__(self):
        """Return the number of image-caption pairs."""
        return len(self.image_caption_pairs)

    def __getitem__(self, idx):
        """Return ``(image, input_ids)`` for the pair at position ``idx``.

        The image is the RGB PIL image, passed through ``transform`` when one
        was given; ``input_ids`` is the caption encoded by the tokenizer,
        padded/truncated to ``max_length``, with the batch axis removed.
        """
        image_name, caption = self.image_caption_pairs[idx]
        image_path = os.path.join(self.image_dir, image_name)

        # Load and transform image; the context manager closes the file as
        # soon as the pixel data has been converted.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Tokenize caption
        encoded_caption = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return image, encoded_caption["input_ids"].squeeze(0)


In [None]:

# Images are read from the notebook's working directory.
image_dir = WORKING_DIR
# Placeholder: point this at a file with one "<image name>\t<caption>" per line.
captions_file = "path_to_captions.txt"

# Image preprocessing for the dataset; mirrors `preprocess_image` so training
# and single-image inference see identically prepared inputs.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create dataset
dataset = ImageCaptionDataset(
    image_dir=image_dir,
    captions_file=captions_file,
    tokenizer=tokenizer,
    transform=transform,
    max_length=50,
)

# Create DataLoader
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    # Pinned host memory only helps host-to-GPU copies.
    pin_memory=torch.cuda.is_available(),
)


In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Define loss function and optimizer.
# Captions are padded to a fixed length by the dataset, so padded positions are
# excluded from the loss; otherwise they would dominate it.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(caption_model.parameters(), lr=0.001)

# VGG16 is used as a fixed encoder: only `caption_model` is optimized.
vgg16_features.eval()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    caption_model.train()
    epoch_loss = 0.0
    num_batches = 0
    for images, captions in dataloader:
        images = images.to(device)
        captions = captions.to(device)

        # Extract features without building an autograd graph for the encoder.
        with torch.no_grad():
            features = vgg16_features(images).reshape(images.size(0), -1)

        # Forward pass. The decoder input is [image, c_0 .. c_{T-2}], i.e. T
        # steps, and step t is trained to predict c_t, so the targets are the
        # full caption (T tokens), not the caption shifted by one.
        outputs = caption_model(features, captions[:, :-1])
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), captions.reshape(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean batch loss so the value does not scale with dataset size.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {mean_loss:.4f}")


In [None]:

#Saving the Model
def save_model(model, optimizer, epoch, path="model_checkpoint.pth"):
    """Write a training checkpoint to ``path``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch value recorded alongside the two state dicts.
        path: Destination file for ``torch.save``.
    """
    checkpoint = {}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    checkpoint['epoch'] = epoch

    torch.save(checkpoint, path)
    print(f"Model saved to {path}")


In [None]:
# Load the model
def load_model(model, optimizer, path="model_checkpoint.pth", device="cpu"):
    """Restore a checkpoint written by ``save_model``.

    Args:
        model: Module that receives the stored ``model_state_dict``.
        optimizer: Optimizer that receives the stored ``optimizer_state_dict``.
            Pass ``None`` to restore the model only (e.g. for inference).
        path: Checkpoint file to read.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The epoch value stored in the checkpoint.
    """
    # NOTE(security): torch.load unpickles the file's contents; only load
    # checkpoints that come from a trusted source.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Model loaded from {path}, resuming from epoch {epoch}")
    return epoch
