In [4]:
import os
import shutil

import pandas as pd

from huggingface_hub import HfApi

In [5]:
# Processed ("raw") dataset: each of these folders holds one sub-folder per label.
path_raw_dataset = "./../../../dataset_processed"
path_raw_fundus, path_raw_mask, path_raw_annot = (
    os.path.join(path_raw_dataset, sub_dir)
    for sub_dir in ("fundus_image", "mask_image", "annotations")
)

# Filtered subset: files are listed and copied directly from these folders.
path_filtered_dataset = "./../../../dataset_used"
path_filtered_fundus, path_filtered_mask = (
    os.path.join(path_filtered_dataset, sub_dir)
    for sub_dir in ("fundus_image", "mask_image")
)

# Local staging folder that is later uploaded to the Hugging Face Hub.
path_hf_dataset = "./../../../dataset_hf"

In [6]:
# Build the staging layout: raw/ and filtered/ each get fundus/ and mask/
# sub-folders, while metadata/ is a single flat folder.
for split_name in ("raw", "filtered"):
    for image_kind in ("fundus", "mask"):
        target_dir = os.path.join(path_hf_dataset, split_name, image_kind)
        os.makedirs(target_dir, exist_ok=True)
os.makedirs(os.path.join(path_hf_dataset, "metadata"), exist_ok=True)

In [7]:
# Raw images are grouped per label: map each label folder to its file names.
raw_fundus_imgs = {
    label_dir: os.listdir(os.path.join(path_raw_fundus, label_dir))
    for label_dir in os.listdir(path_raw_fundus)
}
raw_mask_imgs = {
    label_dir: os.listdir(os.path.join(path_raw_mask, label_dir))
    for label_dir in os.listdir(path_raw_mask)
}

# The filtered folders are listed directly (no label level).
filtered_fundus_imgs = os.listdir(path_filtered_fundus)
filtered_mask_imgs = os.listdir(path_filtered_mask)

# Annotations: keep only the .json files found in each label folder.
annot_imgs = {
    label_dir: [
        entry
        for entry in os.listdir(os.path.join(path_raw_annot, label_dir))
        if entry.endswith(".json")
    ]
    for label_dir in os.listdir(path_raw_annot)
}

In [8]:
def _copy_files(src_dir, dst_dir, file_names, dst_prefix=""):
    """Copy every file in ``file_names`` from ``src_dir`` into ``dst_dir``.

    ``dst_prefix`` is prepended to the destination file name only.
    """
    for file_name in file_names:
        shutil.copy(os.path.join(src_dir, file_name),
                    os.path.join(dst_dir, f"{dst_prefix}{file_name}"))


# Raw images: the per-label source folders are flattened into one folder each.
for class_name, file_names in raw_fundus_imgs.items():
    _copy_files(os.path.join(path_raw_fundus, class_name),
                os.path.join(path_hf_dataset, "raw", "fundus"),
                file_names)
print("Raw Fundus images copied")
for class_name, file_names in raw_mask_imgs.items():
    _copy_files(os.path.join(path_raw_mask, class_name),
                os.path.join(path_hf_dataset, "raw", "mask"),
                file_names)
print("Raw Mask images copied")

# Filtered images are copied straight across.
_copy_files(path_filtered_fundus,
            os.path.join(path_hf_dataset, "filtered", "fundus"),
            filtered_fundus_imgs)
print("Filtered Fundus images copied")
_copy_files(path_filtered_mask,
            os.path.join(path_hf_dataset, "filtered", "mask"),
            filtered_mask_imgs)
print("Filtered Mask images copied")

# Annotation files are renamed to "<label>_<file>" in the shared metadata folder.
for class_name, file_names in annot_imgs.items():
    _copy_files(os.path.join(path_raw_annot, class_name),
                os.path.join(path_hf_dataset, "metadata"),
                file_names,
                dst_prefix=f"{class_name}_")
print("Annotations copied")

Raw Fundus images copied
Raw Mask images copied
Filtered Fundus images copied
Filtered Mask images copied
Annotations copied


In [18]:
# Zip only the raw/ subtree of the staging folder into <path_hf_dataset>/raw.zip.
# With root_dir alone, make_archive packs everything under path_hf_dataset
# (raw/, filtered/ and metadata/) into a file called raw.zip, and the archive
# is itself being written inside the folder it is archiving.
# base_dir="raw" restricts the archive to that sub-folder; entries are stored
# as raw/fundus/... and raw/mask/...
shutil.make_archive(
    os.path.join(path_hf_dataset, "raw"),
    "zip",
    root_dir=path_hf_dataset,
    base_dir="raw",
)

'd:\\Programming\\Python\\Research Prof. Anindita\\dataset_hf\\raw.zip'

In [9]:
# Pair every raw fundus image with a mask and flag whether their file stems agree.
#
# The two listings come from independent os.listdir() calls, whose order is
# arbitrary, so labels and file names are sorted on both sides before they are
# lined up row by row.
fundus_records = [
    (label, img)
    for label, imgs in sorted(raw_fundus_imgs.items())
    for img in sorted(imgs)
]
# Build the frame in one go instead of appending one row at a time.
metadata = pd.DataFrame(fundus_records, columns=["labels", "fundus_image"])

up_raw_mask_imgs = {"labels": [], "mask_image": []}
for label, imgs in sorted(raw_mask_imgs.items()):
    # extend (not append) keeps "labels" flat and aligned with "mask_image".
    up_raw_mask_imgs["labels"].extend([label] * len(imgs))
    up_raw_mask_imgs["mask_image"].extend(sorted(imgs))

# Fail early with a readable message when the two sides cannot be paired.
if len(up_raw_mask_imgs["mask_image"]) != len(metadata):
    raise ValueError(
        f"Found {len(metadata)} fundus images but "
        f"{len(up_raw_mask_imgs['mask_image'])} mask images; cannot pair them row by row."
    )

metadata["mask_image"] = up_raw_mask_imgs["mask_image"]
# splitext strips only the final extension, so stems that contain dots are
# compared in full rather than being cut at the first ".".
metadata["temp_id_fundus"] = metadata["fundus_image"].map(lambda name: os.path.splitext(name)[0])
metadata["temp_id_mask"] = metadata["mask_image"].map(lambda name: os.path.splitext(name)[0])
metadata["validated"] = metadata["temp_id_fundus"] == metadata["temp_id_mask"]

metadata.head(5)

Unnamed: 0,labels,fundus_image,mask_image,temp_id_fundus,temp_id_mask,validated
0,glaucoma,fff_1_100287_l_).jpg,fff_1_100287_l_).png,fff_1_100287_l_),fff_1_100287_l_),True
1,glaucoma,fff_1_100287_r_).jpg,fff_1_100287_r_).png,fff_1_100287_r_),fff_1_100287_r_),True
2,glaucoma,fff_1_10207_l_1.jpg,fff_1_10207_l_1.png,fff_1_10207_l_1,fff_1_10207_l_1,True
3,glaucoma,fff_1_10207_l_2.jpg,fff_1_10207_l_2.png,fff_1_10207_l_2,fff_1_10207_l_2,True
4,glaucoma,fff_1_10207_r_1.jpg,fff_1_10207_r_1.png,fff_1_10207_r_1,fff_1_10207_r_1,True


In [10]:
# Count how many fundus/mask rows have matching file stems (True) versus not (False).
metadata["validated"].value_counts()

validated
True    419
Name: count, dtype: int64

In [11]:
# Hugging Face Hub client; used below to create the dataset repo and upload the staged folder.
# NOTE(review): no token is passed here, so credentials presumably come from a prior login or the environment (confirm).
api = HfApi()

In [None]:
# Create the public dataset repository on the Hub; `repo` is reused by the upload step.
# NOTE(review): repo_id has no namespace prefix, so it presumably lands under the authenticated account (confirm).
repo = api.create_repo(
    repo_type="dataset",
    repo_id="glaucoma-detection-for-segmentation",
    exist_ok=True,  # lets this cell be re-run once the repo exists
    private=False,
)

In [None]:
# Push the whole local staging folder to the dataset repository created above.
api.upload_folder(
    folder_path=path_hf_dataset,
    repo_id=repo.repo_id,
    repo_type="dataset",
    commit_message="Populate the dataset",
)

fff_0_111784_l_y.jpg:   0%|          | 0.00/874k [00:00<?, ?B/s]
[A

[A[A


[A[A[A



fff_0_111784_l_y.jpg:   2%|▏         | 16.4k/874k [00:00<00:23, 36.7kB/s]



[A[A[A[A
[A

[A[A
fff_0_111784_l_y.jpg:  13%|█▎        | 115k/874k [00:00<00:03, 243kB/s]  



[A[A[A[A

[A[A



fff_0_111784_l_y.jpg:  24%|██▍       | 213k/874k [00:00<00:01, 365kB/s]
[A

[A[A
fff_0_111784_l_y.jpg:  32%|███▏      | 279k/874k [00:00<00:01, 362kB/s]



[A[A[A[A



[A[A[A[A

[A[A
fff_0_111784_l_y.jpg:  54%|█████▍    | 475k/874k [00:01<00:00, 531kB/s]



fff_0_111784_l_y.jpg:  64%|██████▍   | 557k/874k [00:01<00:00, 528kB/s]
[A



[A[A[A[A

fff_0_111784_l_y.jpg:  71%|███████▏  | 623k/874k [00:01<00:00, 385kB/s]
[A



[A[A[A[A

[A[A

fff_0_111784_l_y.jpg:  94%|█████████▍| 819k/874k [00:02<00:00, 401kB/s]

[A[A

fff_0_111784_r_y.jpg: 100%|██████████| 871k/871k [00:02<00:00, 310kB/s] 
fff_0_113060_l_3.jpg: 100%|██████████| 849k/849k [00:02<00:00, 288kB/s]
fff_0_113060_l

KeyboardInterrupt: 