In [1]:
import os
import tarfile

# Path of the gzipped CUB_200_2011 archive, relative to the working directory.
compressed_file_path = './CUB_200_2011.tgz'
# Directory the archive is unpacked into by the extraction cell.
extracted_dir_path = './CUBData'

In [2]:
# Unpack the archive into `extracted_dir_path`.
# The 'data' extraction filter rejects members that would escape the target
# directory (absolute paths, '..' components, unsafe links). It is only passed
# when this interpreter provides it, so older Python versions keep working.
with tarfile.open(compressed_file_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(extracted_dir_path, filter='data')
    else:
        tar.extractall(extracted_dir_path)

In [3]:
# Build {image_id: class_id} from image_class_labels.txt.
# Each line holds exactly two whitespace-separated tokens; both stay strings.
labels_path = './CUBData/CUB_200_2011/image_class_labels.txt'
with open(labels_path, 'r') as labels_file:
    image_labels = {
        image_id: class_id
        for image_id, class_id in (row.split() for row in labels_file)
    }

In [4]:
# Build {image_id: relative_image_path} from images.txt.
# Each line holds exactly two whitespace-separated tokens: the id and the path.
images_path = './CUBData/CUB_200_2011/images.txt'
with open(images_path, 'r') as images_file:
    image_paths = {
        image_id: relative_path
        for image_id, relative_path in (row.split() for row in images_file)
    }

In [27]:
# Join the two lookups into parallel lists: one full image path and one label
# per image id. Ids without a label are skipped; order follows `image_paths`.
images_root = './CUBData/CUB_200_2011/images'
labelled_ids = [image_id for image_id in image_paths if image_id in image_labels]

all_image_paths = [
    os.path.join(images_root, image_paths[image_id]) for image_id in labelled_ids
]
all_labels = [image_labels[image_id] for image_id in labelled_ids]

In [30]:
# Sanity check: print the first few (path, label) pairs.
# Slicing (rather than indexing 0..4) avoids an IndexError when fewer than
# five images were found.
for preview_path, preview_label in zip(all_image_paths[:5], all_labels[:5]):
    print(f"Image Path: {preview_path}, Label: {preview_label}")

Image Path: ./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg, Label: 1
Image Path: ./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0009_34.jpg, Label: 1
Image Path: ./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0002_55.jpg, Label: 1
Image Path: ./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0074_59.jpg, Label: 1
Image Path: ./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0014_89.jpg, Label: 1


In [31]:
# First five labels; at this point they are still strings, not integers.
all_labels[:5]

['1', '1', '1', '1', '1']

In [32]:
# First five image paths, in the same order as the labels.
all_image_paths[:5]

['./CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg',
 './CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0009_34.jpg',
 './CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0002_55.jpg',
 './CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0074_59.jpg',
 './CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0014_89.jpg']

In [96]:
class CLIPDataset():
    """
    Indexable dataset that pairs image files with their labels.

    Images are opened lazily, one at a time, when a sample is requested.
    It is convenient to use CLIPDataset with HuggingFace.

    Usage:
        dataset = CLIPDataset(list_image_path=image_paths, list_txt=labels)

    For HuggingFace compatibility:
        hf_dataset = Dataset.from_dict({
            "image_file_path": dataset.image_path,
            "image": [Image.open(image_path) for image_path in dataset.image_path],
            "labels": dataset.label
         })
    """
    def __init__(self, list_image_path, list_txt):
        """
        Args:
            list_image_path: list of image file paths (anything accepted by
                PIL's ``Image.open``)
            list_txt: list of corresponding labels, same length and order as
                ``list_image_path``

        Raises:
            ValueError: if the two lists differ in length.
        """
        # __len__ is based on labels while __getitem__ indexes paths, so a
        # length mismatch would otherwise surface late or not at all.
        if len(list_image_path) != len(list_txt):
            raise ValueError(
                "list_image_path and list_txt must have the same length, "
                f"got {len(list_image_path)} and {len(list_txt)}"
            )
        self.image_path = list_image_path
        self.label = list_txt

    def __len__(self):
        """Return the number of samples."""
        return len(self.label)

    def __getitem__(self, idx):
        """
        Return sample ``idx`` as a dict with the keys ``image_file_path``,
        ``image`` (an opened PIL image) and ``labels``.
        """
        # Imported here so the class does not rely on a later notebook cell
        # having already imported PIL.
        from PIL import Image

        image_path = self.image_path[idx]
        image = Image.open(image_path)
        label = self.label[idx]
        return {"image_file_path": image_path, "image": image, "labels": label}

In [99]:
# Labels in the annotation file are 1-based strings; convert them to 0-based ints.
dataset = CLIPDataset(
    list_image_path=all_image_paths,
    list_txt=[int(raw_label) - 1 for raw_label in all_labels],
)

In [100]:
from PIL import Image

# Fetch the first sample to check the dict returned by __getitem__
# (labels are zero-based at this point).
dataset[0]

{'image_file_path': './CUBData/CUB_200_2011/images/001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x335>,
 'labels': 0}

In [101]:
# Spot-check a sample further into the dataset (index 505 is an arbitrary pick).
dataset[505]

{'image_file_path': './CUBData/CUB_200_2011/images/010.Red_winged_Blackbird/Red_Winged_Blackbird_0109_4454.jpg',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x397>,
 'labels': 9}

In [102]:
from datasets import Dataset, concatenate_datasets

# Number of images opened (and so file handles held) at any one time.
HF_CHUNK_SIZE = 4000


def build_hf_chunk(clip_dataset, start, stop):
    """Build a HuggingFace ``Dataset`` from samples ``start:stop`` of ``clip_dataset``.

    Args:
        clip_dataset: object exposing parallel ``image_path`` and ``label`` lists
        start: index of the first sample in the chunk
        stop: index one past the last sample (may exceed the dataset length)

    Returns:
        Dataset with the columns ``image_file_path``, ``image`` and ``labels``.
    """
    chunk_paths = clip_dataset.image_path[start:stop]
    opened_images = [Image.open(image_path) for image_path in chunk_paths]
    try:
        return Dataset.from_dict({
            "image_file_path": chunk_paths,
            "image": opened_images,
            "labels": clip_dataset.label[start:stop],
        })
    finally:
        # The Arrow table is already built when from_dict returns, so the
        # PIL file handles can be released.
        for opened_image in opened_images:
            opened_image.close()


# Build the dataset chunk by chunk and concatenate once. `hf_dataset` is
# assigned a single time, so re-running this cell cannot duplicate rows.
# max(..., 1) keeps one (empty) chunk when the dataset has no samples.
hf_dataset_chunks = [
    build_hf_chunk(dataset, start, start + HF_CHUNK_SIZE)
    for start in range(0, max(len(dataset), 1), HF_CHUNK_SIZE)
]
hf_dataset = concatenate_datasets(hf_dataset_chunks)

In [106]:
# Inspect the inferred schema; "labels" is a plain integer column at this point.
hf_dataset.features

{'image_file_path': Value(dtype='string', id=None),
 'image': Image(decode=True, id=None),
 'labels': Value(dtype='int64', id=None)}

In [107]:
# Read human-readable class names from classes.txt, in file order.
# Each line is "<class_id> <NNN.Class_Name>": drop the first four characters
# of the name, turn underscores into spaces and lowercase the result.
with open('./CUBData/CUB_200_2011/classes.txt', 'r') as f:
    names = [
        raw_name[4:].replace('_', ' ').lower()
        for _class_id, raw_name in (row.split() for row in f)
    ]

In [112]:
from datasets import ClassLabel

# ClassLabel feature tying the integer label ids to the class names read from
# classes.txt; the dataset is declared to have 200 classes.
class_label = ClassLabel(num_classes=200, names=names)

In [113]:
# Replace the integer "labels" column type with the ClassLabel feature so the
# dataset carries the class names. Reuses `class_label` from the previous cell
# instead of constructing a second, identical ClassLabel.
new_features = hf_dataset.features.copy()
new_features["labels"] = class_label
casted_hf_dataset = hf_dataset.cast(new_features)

Casting the dataset:   0%|          | 0/11788 [00:00<?, ? examples/s]

In [122]:
import huggingface_hub

# Use a token from the HF_TOKEN environment variable when it is set, so the
# notebook can run unattended. With no token (None), login() falls back to
# the interactive prompt as before.
huggingface_hub.login(token=os.environ.get("HF_TOKEN") or None)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [123]:
# Upload the casted dataset to the Hub repository "Andron00e/CUB200-custom".
# NOTE(review): presumably requires the login above with write access to that repo — confirm.
casted_hf_dataset.push_to_hub("Andron00e/CUB200-custom")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/3930 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Map:   0%|          | 0/3929 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Map:   0%|          | 0/3929 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]