In [1]:
import os
import shutil

import pandas as pd

from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folders of the three dataset locations (relative paths).
path_raw_dataset = "./../../../dataset_processed"
path_filtered_dataset = "./../../../dataset_used"
path_hf_dataset = "./../../../dataset_hf"

# Sub-folders of the raw dataset: fundus images, mask images and annotations.
path_raw_fundus, path_raw_mask, path_raw_annot = (
    os.path.join(path_raw_dataset, subfolder)
    for subfolder in ("fundus_image", "mask_image", "annotations")
)

# Sub-folders of the filtered dataset: fundus images and mask images.
path_filtered_fundus, path_filtered_mask = (
    os.path.join(path_filtered_dataset, subfolder)
    for subfolder in ("fundus_image", "mask_image")
)

In [3]:
# Folder layout created under the staging root: fundus/mask folders for both
# the raw and the filtered image sets, plus one flat metadata folder.
hf_subfolders = [
    ("raw", "fundus"),
    ("raw", "mask"),
    ("filtered", "fundus"),
    ("filtered", "mask"),
    ("metadata",),
]
for subfolder_parts in hf_subfolders:
    # exist_ok=True lets the cell be re-run without failing on existing folders.
    os.makedirs(os.path.join(path_hf_dataset, *subfolder_parts), exist_ok=True)

In [4]:
# Raw images are stored one sub-folder per label: map each label to its file names.
raw_fundus_imgs = {
    label: os.listdir(os.path.join(path_raw_fundus, label))
    for label in os.listdir(path_raw_fundus)
}
raw_mask_imgs = {
    label: os.listdir(os.path.join(path_raw_mask, label))
    for label in os.listdir(path_raw_mask)
}

# The filtered dataset is listed directly from its two folders (no label level).
filtered_fundus_imgs = os.listdir(path_filtered_fundus)
filtered_mask_imgs = os.listdir(path_filtered_mask)

# Annotations are also grouped per label; only the .json files are kept.
annot_imgs = {
    label: [
        name
        for name in os.listdir(os.path.join(path_raw_annot, label))
        if name.endswith(".json")
    ]
    for label in os.listdir(path_raw_annot)
}

In [5]:
def _copy_files(names, src_dir, dst_dir, prefix=""):
    """Copy every file in `names` from `src_dir` into `dst_dir`.

    The destination file name is `prefix` followed by the original name.
    """
    for name in names:
        shutil.copy(os.path.join(src_dir, name),
                    os.path.join(dst_dir, f"{prefix}{name}"))


# Raw images: the per-label sub-folders are flattened into a single folder.
for label, imgs in raw_fundus_imgs.items():
    _copy_files(imgs,
                os.path.join(path_raw_fundus, label),
                os.path.join(path_hf_dataset, "raw", "fundus"))
print("Raw Fundus images copied")
for label, imgs in raw_mask_imgs.items():
    _copy_files(imgs,
                os.path.join(path_raw_mask, label),
                os.path.join(path_hf_dataset, "raw", "mask"))
print("Raw Mask images copied")

# Filtered images are copied folder to folder under the same names.
_copy_files(filtered_fundus_imgs,
            path_filtered_fundus,
            os.path.join(path_hf_dataset, "filtered", "fundus"))
print("Filtered Fundus images copied")
_copy_files(filtered_mask_imgs,
            path_filtered_mask,
            os.path.join(path_hf_dataset, "filtered", "mask"))
print("Filtered Mask images copied")

# Annotation files get their label prepended ("<label>_<file>") in the flat
# metadata folder.
for label, annots in annot_imgs.items():
    _copy_files(annots,
                os.path.join(path_raw_annot, label),
                os.path.join(path_hf_dataset, "metadata"),
                prefix=f"{label}_")
print("Annotations copied")

Raw Fundus images copied
Raw Mask images copied
Filtered Fundus images copied
Filtered Mask images copied
Annotations copied


In [6]:
# Zip the staged "raw" and "filtered" folders into raw_image.zip and
# filtered_image.zip, both written next to them in the staging root.
for archive_name, staged_folder in (("raw_image", "raw"), ("filtered_image", "filtered")):
    archive_path = shutil.make_archive(
        base_name=os.path.join(path_hf_dataset, archive_name),
        format="zip",
        root_dir=path_hf_dataset,
        base_dir=staged_folder,
    )

# Display the path of the last archive written as the cell output.
archive_path

'd:\\Programming\\Python\\Research Prof. Anindita\\dataset_hf\\filtered_image.zip'

In [7]:
# Delete the staged "raw" and "filtered" image folders from the staging root.
for staged_folder in ("raw", "filtered"):
    shutil.rmtree(os.path.join(path_hf_dataset, staged_folder))

In [9]:
# Build the metadata table: one row per raw fundus image, the name of its mask,
# and whether the fundus image is also part of the filtered dataset.
#
# Fundus and mask files are paired by file stem (name without extension), not
# by list position: os.listdir() returns entries in arbitrary order, so two
# independent listings are not guaranteed to line up, and a differing number of
# files would make a positional column assignment fail.
mask_by_stem = {
    os.path.splitext(mask_name)[0]: mask_name
    for mask_names in raw_mask_imgs.values()
    for mask_name in mask_names
}
fundus_names = [img for imgs in raw_fundus_imgs.values() for img in imgs]

# Build the frame in one go instead of appending row by row.
metadata = pd.DataFrame({
    "fundus_image_raw": fundus_names,
    # A fundus image without a matching mask gets a missing value here.
    "mask_image_raw": [
        mask_by_stem.get(os.path.splitext(img)[0]) for img in fundus_names
    ],
})
metadata["included_in_filtered_ds"] = metadata.fundus_image_raw.isin(filtered_fundus_imgs)

metadata.to_csv(os.path.join(path_hf_dataset, "metadata", "imgs_filtered_info.csv"), index=False)
metadata.head(5)

Unnamed: 0,fundus_image_raw,mask_image_raw,included_in_filtered_ds
0,fff_1_100287_l_).jpg,fff_1_100287_l_).png,True
1,fff_1_100287_r_).jpg,fff_1_100287_r_).png,True
2,fff_1_10207_l_1.jpg,fff_1_10207_l_1.png,True
3,fff_1_10207_l_2.jpg,fff_1_10207_l_2.png,True
4,fff_1_10207_r_1.jpg,fff_1_10207_r_1.png,True


In [10]:
# Hugging Face Hub client; the repository creation and upload cells below call it.
# NOTE(review): no token is passed here, so this presumably relies on a
# previously stored login or an environment token — confirm before re-running.
api = HfApi()

In [11]:
# Settings for the target repository: a public dataset repo; exist_ok=True
# means an already existing repository is not treated as an error.
repo_settings = {
    "repo_id": "glaucoma-detection-for-segmentation",
    "repo_type": "dataset",
    "private": False,
    "exist_ok": True,
}
repo = api.create_repo(**repo_settings)

In [18]:
# Upload only README.md from the staging folder to the root of the repository.
readme_path = os.path.join(path_hf_dataset, "README.md")
api.upload_file(
    path_or_fileobj=readme_path,
    path_in_repo="README.md",
    repo_id=repo.repo_id,
    repo_type=repo.repo_type,
    commit_message="refine the content of README.md",
)

CommitInfo(commit_url='https://huggingface.co/datasets/bugi-sulistiyo/glaucoma-detection-for-segmentation/commit/87b111793e47852302f14d2b1b889b185268e758', commit_message='refine the content of README.md', commit_description='', oid='87b111793e47852302f14d2b1b889b185268e758', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bugi-sulistiyo/glaucoma-detection-for-segmentation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bugi-sulistiyo/glaucoma-detection-for-segmentation'), pr_revision=None, pr_num=None)

In [None]:
# Upload the content of the whole staging folder to the dataset repository.
folder_upload_settings = {
    "repo_id": repo.repo_id,
    "folder_path": path_hf_dataset,
    "repo_type": "dataset",
    "commit_message": "Populating the dataset",
}
api.upload_folder(**folder_upload_settings)

filtered_image.zip:   0%|          | 0.00/301M [00:00<?, ?B/s]
[A
filtered_image.zip:   0%|          | 16.4k/301M [00:00<1:44:10, 48.1kB/s]
filtered_image.zip:   0%|          | 147k/301M [00:00<12:26, 403kB/s]    
filtered_image.zip:   0%|          | 311k/301M [00:00<10:20, 485kB/s]
filtered_image.zip:   0%|          | 524k/301M [00:00<06:29, 772kB/s]
filtered_image.zip:   0%|          | 623k/301M [00:01<06:09, 813kB/s]
filtered_image.zip:   0%|          | 754k/301M [00:01<05:52, 850kB/s]
filtered_image.zip:   0%|          | 852k/301M [00:01<06:49, 733kB/s]
[A
filtered_image.zip:   0%|          | 1.02M/301M [00:01<07:10, 697kB/s]
filtered_image.zip:   0%|          | 1.13M/301M [00:01<06:47, 735kB/s]
filtered_image.zip:   0%|          | 1.36M/301M [00:01<05:41, 877kB/s]
[A
filtered_image.zip:   0%|          | 1.46M/301M [00:02<05:55, 842kB/s]
filtered_image.zip:   1%|          | 1.56M/301M [00:02<06:38, 751kB/s]
filtered_image.zip:   1%|          | 1.64M/301M [00:02<07:17, 685kB/s]
f

CommitInfo(commit_url='https://huggingface.co/datasets/bugi-sulistiyo/glaucoma-detection-for-segmentation/commit/f16414e3123374d00772c5894289f5addc07a0fa', commit_message='Populate the dataset', commit_description='', oid='f16414e3123374d00772c5894289f5addc07a0fa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bugi-sulistiyo/glaucoma-detection-for-segmentation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bugi-sulistiyo/glaucoma-detection-for-segmentation'), pr_revision=None, pr_num=None)