In [None]:
import pandas as pd
import torch
import torchvision.transforms as transforms


In [None]:
# Read the caption annotations into a DataFrame
captions_csv = 'captions folder/results.csv'
df = pd.read_csv(captions_csv)

In [None]:
# Image preprocessing pipeline
# 1. Resize every image to 224x224, then convert it to a tensor
resize_step = transforms.Resize((224, 224))
to_tensor_step = transforms.ToTensor()
image_transformer = transforms.Compose([resize_step, to_tensor_step])

In [None]:
# 2. Tokenize the captions (whitespace split) and collect every distinct token
vocab = set()
for caption in df['caption']:
    # Skip missing / non-string entries (e.g. NaN produced by an empty CSV field),
    # which have no .split() method.
    if isinstance(caption, str):
        vocab.update(caption.split())
# 3. Build the vocabulary from every token seen (no frequency filtering is applied).
# Sorting makes the word -> index mapping identical on every run; iterating a set
# of strings directly yields an order that changes between kernel restarts.
vocab = sorted(vocab)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# 4. Map each word in the captions to an integer index
# Index used to right-pad short captions: reuse a '<pad>' entry if the vocabulary
# defines one, otherwise take the first index past the end of the vocabulary
# (so an embedding table needs len(word_to_idx) + 1 rows in that case).
pad_idx = word_to_idx.get('<pad>', len(word_to_idx))

caption_indices = []
for caption in df['caption']:
    # A missing / non-string caption becomes an empty sequence (all padding),
    # which keeps captions row-aligned with the images.
    tokens = caption.split() if isinstance(caption, str) else []
    caption_indices.append(
        torch.tensor([word_to_idx[token] for token in tokens], dtype=torch.long)
    )

# 5. Convert the captions to tensors
# Captions have different lengths, so torch.tensor() on the ragged nested list
# raises ValueError. pad_sequence pads every caption to the longest one and
# returns a (num_captions, max_len) LongTensor.
if caption_indices:
    captions = torch.nn.utils.rnn.pad_sequence(
        caption_indices, batch_first=True, padding_value=pad_idx
    )
else:
    # pad_sequence rejects an empty list; keep a well-formed empty 2-D tensor.
    captions = torch.empty((0, 0), dtype=torch.long)

In [None]:
# 6. Create a dataset object
class CaptionDataset(torch.utils.data.Dataset):
    """Pairs each image entry with its caption entry by position.

    Args:
        images: Sized, indexable collection of image entries (e.g. a list or
            a pandas Series).
        captions: Sized, indexable collection of caption entries with the
            same length as ``images`` (e.g. a 2-D tensor or a list).
        transform: Optional callable applied to the image entry before it is
            returned. Defaults to ``None`` (entry returned unchanged).

    Raises:
        ValueError: If ``images`` and ``captions`` differ in length.
    """

    def __init__(self, images, captions, transform=None):
        if len(images) != len(captions):
            raise ValueError(
                f"images and captions must have the same length, "
                f"got {len(images)} and {len(captions)}"
            )
        self.images = images
        self.captions = captions
        self.transform = transform

    @staticmethod
    def _item_at(container, idx):
        """Return the ``idx``-th element of ``container`` by position."""
        # pandas objects index by *label* with []; .iloc is the positional
        # accessor, so a Series whose index is not 0..n-1 still works.
        positional = getattr(container, 'iloc', None)
        return container[idx] if positional is None else positional[idx]

    def __len__(self):
        """Return the number of (image, caption) pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image, caption)`` pair at position ``idx``."""
        image = self._item_at(self.images, idx)
        caption = self._item_at(self.captions, idx)
        if self.transform is not None:
            image = self.transform(image)
        return image, caption

In [None]:
# Pair the image column with the caption tensors
dataset = CaptionDataset(images=df['image'], captions=captions)


In [None]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches
loader_options = {'batch_size': 32, 'shuffle': True}
train_dataloader = torch.utils.data.DataLoader(dataset, **loader_options)