In [7]:
import random
import tqdm
import numpy as np
from matplotlib import pyplot as plt
from transformers import AutoTokenizer

# Tokenizer used below to count text tokens per sample.
# Swap the checkpoint id to measure with a different vocabulary, e.g.:
# TOKENIZER_CHECKPOINT = "HuggingFaceTB/SmolLM-360M-Instruct"
TOKENIZER_CHECKPOINT = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)


def validate_dataset(dataset, n_samples=2000):
    """Plot token-count and image-size statistics over random samples of ``dataset``.

    Draws ``n_samples`` items uniformly at random (with replacement), tokenizes
    each item's ``'text'`` with the module-level ``tokenizer`` and records the
    ``.size`` of its ``'image'``. Prints the shape of the collected size array
    and shows two histograms: tokens per sample, and image width/height.

    Args:
        dataset: Indexable collection supporting ``len()`` whose items are
            mappings with a ``'text'`` entry and an ``'image'`` entry exposing
            ``.size`` (presumably a PIL image, i.e. ``(width, height)`` --
            confirm against the dataset classes).
        n_samples: Number of random draws; must be positive.

    Raises:
        ValueError: If ``dataset`` is empty or ``n_samples`` is not positive.
    """
    dataset_size = len(dataset)
    if dataset_size == 0:
        raise ValueError('Cannot validate an empty dataset.')
    if n_samples <= 0:
        # Without this guard the average in the first title divides by zero.
        raise ValueError('n_samples must be positive, got {}'.format(n_samples))

    n_tokens = []
    resolutions = []
    for _ in tqdm.tqdm(range(n_samples)):
        sample = dataset[random.randrange(dataset_size)]
        # encode() returns a plain list of ids by default, so its length is the
        # token count; no tensor (and hence no torch dependency) is needed.
        n_tokens.append(len(tokenizer.encode(sample['text'])))
        resolutions.append(np.array(sample['image'].size))

    resolutions = np.array(resolutions)
    print('Resolution:', resolutions.shape)

    plt.figure(figsize=(20, 5))
    plt.title('Average Tokens: {:.2f}'.format(sum(n_tokens) / len(n_tokens)))
    plt.hist(n_tokens, bins=500)
    plt.xlabel('Tokens per sample')
    plt.ylabel('Number of samples')
    plt.show()

    plt.figure(figsize=(20, 5))
    plt.title('Average Resolution ' + str(np.mean(resolutions, axis=0)))
    # Semi-transparent bars keep both distributions visible where they overlap.
    plt.hist(resolutions[:, 0], bins=200, alpha=0.5, label='Width')
    plt.hist(resolutions[:, 1], bins=200, alpha=0.5, label='Height')
    plt.xlabel('Pixels')
    plt.ylabel('Number of samples')
    # The labels above are only rendered once a legend is requested.
    plt.legend()
    plt.show()


def visualize_sample(dataset, n_samples=5):
    """Display randomly chosen samples: each image, followed by its text.

    Prints the dataset size first, then for each of ``n_samples`` draws
    (uniform, with replacement) shows the sample's ``'image'`` with matplotlib
    and prints its ``'text'``.

    Args:
        dataset: Indexable collection supporting ``len()`` whose items are
            mappings with ``'image'`` and ``'text'`` entries.
        n_samples: Number of random samples to display.
    """
    total = len(dataset)
    print('Dataset size:', total)

    shown = 0
    while shown < n_samples:
        # Pick a random index rather than walking the dataset in order.
        index = random.randint(0, total - 1)
        picked = dataset[index]

        plt.imshow(picked['image'])
        plt.show()
        print(picked['text'])

        shown += 1


In [None]:

from data import (
    ShareGPT4V,
    ImageNet,
    Cambrian,
    PixmoDataset,
    CLEVRCaption
)

# Split passed to whichever dataset is enabled below.
split = 'train'

# ---------------------------------------------------------------------------
# Dataset selection: exactly one `dataset = ...` assignment should be active.
# Uncomment an alternative (and comment out the active one) to inspect it.
# ---------------------------------------------------------------------------

# --- ImageNet ---
# dataset = ImageNet(root='/datasets01/imagenet_full_size/061417', split=split)
# dataset = ImageNet(root='/datasets01/imagenet-22k/062717', split=split)

# --- ShareGPT4V ---
# dataset = ShareGPT4V(
#     root='/private/home/delong/workspace/data/ShareGPT4V',
#     annotation='sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json',
#     split=split)

# NOTE(review): this variant passes the annotation file name as `split`, unlike
# the two other ShareGPT4V variants, which use `annotation=` plus `split=split`;
# confirm which keyword is intended before enabling it.
# dataset = ShareGPT4V(
#     root='/private/home/delong/workspace/data/ShareGPT4V',
#     split='share-captioner_coco_lcs_sam_1246k_1107.json')

# dataset = ShareGPT4V(
#     root='/private/home/delong/workspace/data/ShareGPT4V',
#     annotation='sharegpt4v_instruct_gpt4-vision_cap100k.json',
#     split=split)

# --- Cambrian ---
# dataset = Cambrian(
#     root='/private/home/delong/workspace/data/Cambrian-10M',
#     max_samples=None)

# --- Pixmo ---
# dataset = PixmoDataset(
#     root='/private/home/delong/workspace/data/pixmo-cap',
#     split=split)

# --- CLEVR captions (currently active) ---
dataset = CLEVRCaption(
    root='/private/home/delong/workspace/data/clevr-caption',
    split=split,
)

# TODO: these two classes are not among the names imported from `data` in this cell.
# dataset = ImageParagraphCaptioning(root='/home/dchenbs/workspace/datasets/VisualGenome', split=split)
# dataset = CocoCaptionDataset(root='/share/datasets/coco2017', split=split)

# Report how many samples the selected dataset exposes.
print(len(dataset))

In [None]:

# Visual sanity check: show 5 randomly drawn samples (image, then its text).
visualize_sample(dataset, n_samples=5)

In [None]:

# Statistical check: histogram token counts and image sizes over 1000 random samples.
validate_dataset(dataset, n_samples=1000)