In [1]:
import torch
import pandas as pd
import os as os
import numpy as np

In [2]:
from datasets import load_dataset

# Fetch every split of the Karpathy-split COCO captions dataset from the Hub.
ds = load_dataset(path="yerevann/coco-karpathy")
# Show (number of rows, number of columns) for the training split.
ds["train"].shape

(82783, 8)

In [7]:
from transformers import BertTokenizer
# Load the uncased BERT WordPiece vocabulary.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)
# Sanity check: encode a short string and display the resulting token ids.
tokenizer.encode(text="hi")

[101, 7632, 102]

In [13]:
# Inspect the first training row to see the available fields.
ds["train"][0]

{'filepath': 'train2014',
 'sentids': [787980, 789366, 789888, 791316, 794853],
 'filename': 'COCO_train2014_000000057870.jpg',
 'imgid': 40504,
 'split': 'train',
 'sentences': ['A restaurant has modern wooden tables and chairs.',
  'A long restaurant table with rattan rounded back chairs.',
  'a long table with a plant on top of it surrounded with wooden chairs ',
  'A long table with a flower arrangement in the middle for meetings',
  'A table is adorned with wooden chairs with blue accents.'],
 'cocoid': 57870,
 'url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg'}

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from PIL import Image
import requests
from io import BytesIO
import torchvision.transforms as transforms
from datasets import DatasetDict
import logging
import random

class maindataset(Dataset):
    """Image/caption dataset over one split of the COCO-Karpathy dataset.

    Each underlying row is expected to be a mapping with at least the keys
    'sentences' (list of caption strings), 'url', 'imgid', 'filename' and
    'filepath'; 'cocoid' is optional and defaults to -1 in the output.

    Args:
        dataset: Indexable collection of rows (e.g. one HuggingFace split).
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        transforms: Callable applied to the decoded RGB PIL image; it should
            return a tensor so that samples can be batched.
        max_length: Fixed token length every caption is padded/truncated to.
        use_all_sentences: If False (default) each item is one image with a
            randomly chosen caption, so ``len(self) == len(dataset)``. If
            True every (image, caption) pair becomes its own item, so
            ``len(self)`` is the total number of captions and item access is
            deterministic.
    """

    # Shape of the all-zero placeholder returned when an image cannot be
    # fetched; it matches the 224x224 RGB output of the transform built in
    # create_dataloaders.
    FALLBACK_IMAGE_SHAPE = (3, 224, 224)

    def __init__(self, dataset, tokenizer, transforms, max_length=64,
                 use_all_sentences=False):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.transforms = transforms
        self.max_length = max_length
        self.use_all_sentences = use_all_sentences
        # Flat (row, caption) index; only built when every caption is used.
        self._pairs = self._index_sentences(dataset) if use_all_sentences else None

    @staticmethod
    def _index_sentences(dataset):
        """Return a list of (row_index, sentence_index) for every caption."""
        try:
            # Column access avoids materialising every full row when the
            # container supports it.
            captions_per_row = dataset['sentences']
        except (TypeError, KeyError, IndexError):
            # Plain sequences of dicts cannot be indexed by column name.
            captions_per_row = (dataset[i]['sentences'] for i in range(len(dataset)))
        return [
            (row, k)
            for row, captions in enumerate(captions_per_row)
            for k in range(len(captions))
        ]

    def __len__(self):
        if self._pairs is not None:
            return len(self._pairs)
        return len(self.dataset)

    def _load_image(self, item):
        """Download and transform the image of ``item``.

        Download/decoding failures are logged and replaced by an all-zero
        tensor so that one unreachable URL does not abort a whole epoch.
        """
        try:
            # The body is read in full right away, so no streaming is needed.
            response = requests.get(item['url'], timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert('RGB')
        except Exception as e:
            logging.warning(f"Error loading image from URL {item['url']}: {e}")
            return torch.zeros(*self.FALLBACK_IMAGE_SHAPE)
        # Applied outside the try block on purpose: a broken transform is a
        # programming error and must not be disguised as a download failure.
        return self.transforms(img)

    def __getitem__(self, index):
        """Return a dict with tokenised caption, image tensor and row metadata."""
        if self._pairs is not None:
            row, sentence_idx = self._pairs[index]
            item = self.dataset[row]
            text = item['sentences'][sentence_idx]
        else:
            item = self.dataset[index]
            text = random.choice(item['sentences'])

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'image': self._load_image(item),
            'imgid': item['imgid'],
            'cocoid': item.get('cocoid', -1),
            'filename': item['filename'],
            'filepath': item['filepath']
        }


def create_dataloaders(dataset_dict, tokenizer_name="bert-base-uncased", batch_size=32,
                       num_workers=4, use_all_sentences=False):
    """
    Create dataloaders using the predefined splits in the dataset

    Args:
        dataset_dict: HuggingFace DatasetDict containing the dataset splits
        tokenizer_name: Name of the tokenizer to use
        batch_size: Batch size for the dataloaders
        num_workers: Number of workers for data loading
        use_all_sentences: Whether to use all sentences or randomly select one

    Returns:
        Dictionary containing dataloaders for each split
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Resize to 224x224 and normalise with the mean/std values given below.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    dataloaders = {}

    for split, dataset in dataset_dict.items():
        # The constructor's keyword is `transforms` (plural).
        split_dataset = maindataset(
            dataset=dataset,
            tokenizer=tokenizer,
            transforms=transform,
            use_all_sentences=use_all_sentences
        )

        shuffle = split in ('train', 'restval')  # Shuffle for training and restval

        dataloaders[split] = DataLoader(
            split_dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=True  # Helps speed up data transfer to GPU
        )

    return dataloaders
