In [1]:
import json
import scipy.io
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Oxford Flowers

In [2]:
# Index Oxford Flowers: one row per image with its label, class name and split.
paths = sorted(Path("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/jpg").rglob("*.jpg"))

imagelabels = scipy.io.loadmat("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/imagelabels.mat")
setid = scipy.io.loadmat("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/setid.mat")

label_name_df = pd.read_csv("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/oxford_flower_102_name.csv")
# Keys are Index + 1 because they are looked up with the labels read from imagelabels.mat.
label_name_mapping = {}
for _, name_row in label_name_df.iterrows():
    label_name_mapping[int(name_row["Index"]) + 1] = name_row["Name"]

# image id -> split name; the keys are filled in train, val, test order.
split_mapping = {}
for setid_key, split_name in (("trnid", "train"), ("valid", "val"), ("tstid", "test")):
    for image_id in setid[setid_key].flatten().tolist():
        split_mapping[image_id] = split_name

df = pd.DataFrame({"image_path": paths, "image_name": [p.name for p in paths]})
# Labels are attached positionally, i.e. in the sorted order of the image files.
df["label"] = imagelabels["labels"].flatten()
df["label_name"] = df["label"].map(lambda label: label_name_mapping[label])

# The numeric id is the part of the file name between the last "_" and the first following ".".
image_ids = [int(name.split("_")[-1].split(".")[0]) for name in df["image_name"]]
df["split"] = [split_mapping[image_id] for image_id in image_ids]

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

8189


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/Oxford_Flowers...,image_00001.jpg,77,passion flower,test,1.jpg
1,/data/hanchong/open-source-data/Oxford_Flowers...,image_00002.jpg,77,passion flower,test,2.jpg
2,/data/hanchong/open-source-data/Oxford_Flowers...,image_00003.jpg,77,passion flower,test,3.jpg
3,/data/hanchong/open-source-data/Oxford_Flowers...,image_00004.jpg,77,passion flower,test,4.jpg
4,/data/hanchong/open-source-data/Oxford_Flowers...,image_00005.jpg,77,passion flower,test,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/Oxford_Flowers/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

  0%|          | 0/1020 [00:00<?, ?it/s]

100%|██████████| 1020/1020 [00:00<00:00, 6743.83it/s]
100%|██████████| 1020/1020 [00:00<00:00, 7446.22it/s]
100%|██████████| 6149/6149 [00:00<00:00, 8523.02it/s]


# CUB200-2011

In [2]:
# Index CUB200-2011: class id and name come from the parent folder name, the
# train/test flag from the split file; a validation subset is sampled from train.
paths = sorted(Path("/data/hanchong/open-source-data/CUB200-2011/raw-data/images").rglob("*.jpg"))

index_split_df = pd.read_csv("/data/hanchong/open-source-data/CUB200-2011/raw-data/train_test_split.txt", sep=" ", header=None)
index_split_df.columns = ["Index", "Split"]

index_image_df = pd.read_csv("/data/hanchong/open-source-data/CUB200-2011/raw-data/images.txt", sep=" ", header=None)
index_image_df.columns = ["Index", "Image"]

split_df = index_split_df.merge(index_image_df, on="Index")

# file name (without its folder) -> split name
split_names = {0: "test", 1: "train"}
split_mapping = {}
for _, split_row in split_df.iterrows():
    file_name = split_row["Image"].split("/")[-1]
    split_mapping[file_name] = split_names[split_row["Split"]]

# Folder names are split on "." : first piece -> integer label, last piece -> class name.
class_dirs = [p.parent.name for p in paths]
df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [p.name for p in paths],
        "label": [int(class_dir.split(".")[0]) for class_dir in class_dirs],
        "label_name": [class_dir.split(".")[-1] for class_dir in class_dirs],
    }
)
df["split"] = [split_mapping[name] for name in df["image_name"]]

# Move 5 training images per label into the validation split (fixed seed).
val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

11788


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0001_796111.jpg,1,Black_footed_Albatross,test,1.jpg
1,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0002_55.jpg,1,Black_footed_Albatross,test,2.jpg
2,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0003_796136.jpg,1,Black_footed_Albatross,test,3.jpg
3,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0005_796090.jpg,1,Black_footed_Albatross,test,4.jpg
4,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0006_796065.jpg,1,Black_footed_Albatross,test,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/CUB200-2011/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

  0%|          | 0/4994 [00:00<?, ?it/s]

100%|██████████| 4994/4994 [00:00<00:00, 7388.44it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7353.13it/s]
100%|██████████| 5794/5794 [00:00<00:00, 8633.80it/s]


# Stanford Dogs

In [2]:
# Index Stanford Dogs: image files on disk are joined with the train/test lists
# from the .mat files; a validation subset is sampled from train.
folder = Path("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/Images")
paths = sorted(folder.rglob("*.jpg"))

train_list = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/train_list.mat")
test_list = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/test_list.mat")

df = pd.DataFrame({"image_path": paths, "image_name": [p.name for p in paths]})

# One frame per list; the class name is the folder part of each relative file path.
split_frames = []
for list_mat, split_name in ((train_list, "train"), (test_list, "test")):
    relative_files = [entry[0] for entry in list_mat["file_list"].flatten().tolist()]
    split_frames.append(
        pd.DataFrame(
            {
                "image_path": [folder / relative_file for relative_file in relative_files],
                "label": list_mat["labels"].flatten().tolist(),
                "label_name": [relative_file.split("/")[0] for relative_file in relative_files],
                "split": split_name,
            }
        )
    )
split_df = pd.concat(split_frames, ignore_index=True)

# Inner join: only files that are both on disk and listed in a split file are kept.
df = pd.merge(df, split_df, on="image_path")

# Move 5 training images per label into the validation split (fixed seed).
val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

20580


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10074.jpg,1,n02085620-Chihuahua,test,1.jpg
1,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10131.jpg,1,n02085620-Chihuahua,test,2.jpg
2,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10621.jpg,1,n02085620-Chihuahua,train,3.jpg
3,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_1073.jpg,1,n02085620-Chihuahua,test,4.jpg
4,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10976.jpg,1,n02085620-Chihuahua,train,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/Stanford_Dogs/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

100%|██████████| 11400/11400 [00:01<00:00, 7379.76it/s]
100%|██████████| 600/600 [00:00<00:00, 7531.57it/s]
100%|██████████| 8580/8580 [00:00<00:00, 8590.71it/s]


# Stanford Cars

In [2]:
# Index Stanford Cars: train and test images live in separate folders and are
# joined with their own annotation files by file name, then concatenated.
folder = Path("/data/hanchong/open-source-data/Stanford_Cars/raw-data")

train_paths = sorted((folder / "cars_train").rglob("*.jpg"))
test_paths = sorted((folder / "cars_test").rglob("*.jpg"))

cars_meta = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/devkit/cars_meta.mat")
class_names = [name[0] for name in cars_meta["class_names"].flatten().tolist()]

cars_train_annos = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/devkit/cars_train_annos.mat")
cars_test_annos = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/cars_test_annos_withlabels.mat")


def _annotation_frame(annos, split_name):
    """Build an image_name / label / label_name / split frame from a loaded annotation file.

    The last field of each annotation record is read as the file name and the
    second-to-last as the label; labels index ``class_names`` with an offset of one.
    """
    records = annos["annotations"].flatten().tolist()
    frame = pd.DataFrame(
        {
            "image_name": [record[-1][0] for record in records],
            "label": [record[-2][0][0] for record in records],
        }
    )
    frame["label_name"] = frame["label"].apply(lambda label: class_names[label - 1])
    frame["split"] = split_name
    return frame


def _path_frame(image_paths):
    """Build an image_path / image_name frame from a list of image files."""
    return pd.DataFrame(
        {
            "image_path": image_paths,
            "image_name": [image_path.name for image_path in image_paths],
        }
    )


# File names are only matched within their own split folder.
train_df = _path_frame(train_paths).merge(_annotation_frame(cars_train_annos, "train"), on="image_name")
test_df = _path_frame(test_paths).merge(_annotation_frame(cars_test_annos, "test"), on="image_name")

df = pd.concat([train_df, test_df], ignore_index=True)

# Move 5 training images per label into the validation split (fixed seed).
val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

16185


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/Stanford_Cars/...,00001.jpg,14,Audi TTS Coupe 2012,train,1.jpg
1,/data/hanchong/open-source-data/Stanford_Cars/...,00002.jpg,3,Acura TL Sedan 2012,train,2.jpg
2,/data/hanchong/open-source-data/Stanford_Cars/...,00003.jpg,91,Dodge Dakota Club Cab 2007,train,3.jpg
3,/data/hanchong/open-source-data/Stanford_Cars/...,00004.jpg,134,Hyundai Sonata Hybrid Sedan 2012,train,4.jpg
4,/data/hanchong/open-source-data/Stanford_Cars/...,00005.jpg,106,Ford F-450 Super Duty Crew Cab 2012,train,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/Stanford_Cars/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

  0%|          | 0/7164 [00:00<?, ?it/s]

100%|██████████| 7164/7164 [00:00<00:00, 7470.20it/s]
100%|██████████| 980/980 [00:00<00:00, 7739.78it/s]
100%|██████████| 8041/8041 [00:00<00:00, 8753.65it/s]


# FGVC Aircraft

In [3]:
# Index FGVC Aircraft: labels are line positions in variants.txt, splits come
# from the three images_variant_<split>.txt files.
paths = sorted(Path("/data/hanchong/open-source-data/FGVC_Aircraft/raw-data/data/images").rglob("*.jpg"))

df = pd.DataFrame({"image_path": paths, "image_name": [p.name for p in paths]})

# variant name -> 1-based line number in variants.txt
label_name_mapping = {}
with open("/data/hanchong/open-source-data/FGVC_Aircraft/raw-data/data/variants.txt") as f:
    for index, line in enumerate(f):
        label_name_mapping[line.strip()] = index + 1

annotation_files = [
    ("/data/hanchong/open-source-data/FGVC_Aircraft/raw-data/data/images_variant_train.txt", "train"),
    ("/data/hanchong/open-source-data/FGVC_Aircraft/raw-data/data/images_variant_val.txt", "val"),
    ("/data/hanchong/open-source-data/FGVC_Aircraft/raw-data/data/images_variant_test.txt", "test"),
]

data_list = []
for path, split in annotation_files:
    with open(path) as f:
        for line in f:
            # First space-separated token is the image id; the rest of the line is
            # the variant name (which may itself contain spaces).
            image_name = line.strip().split(" ", 1)[0]
            variant_name = line[len(image_name) + 1 :].strip()
            data_list.append(
                {
                    "image_name": image_name + ".jpg",
                    "label": label_name_mapping[variant_name],
                    "label_name": variant_name,
                    "split": split,
                }
            )

# Inner join: only files that are both on disk and listed in a split file are kept.
df = pd.merge(df, pd.DataFrame(data_list), on="image_name")

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

10000


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/FGVC_Aircraft/...,0034309.jpg,56,DC-8,val,1.jpg
1,/data/hanchong/open-source-data/FGVC_Aircraft/...,0034958.jpg,3,737-200,val,2.jpg
2,/data/hanchong/open-source-data/FGVC_Aircraft/...,0037511.jpg,57,DC-9-30,val,3.jpg
3,/data/hanchong/open-source-data/FGVC_Aircraft/...,0037512.jpg,3,737-200,test,4.jpg
4,/data/hanchong/open-source-data/FGVC_Aircraft/...,0038598.jpg,86,MD-11,train,5.jpg


In [4]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/FGVC_Aircraft/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

  0%|          | 0/3334 [00:00<?, ?it/s]

100%|██████████| 3334/3334 [00:00<00:00, 7266.81it/s]
100%|██████████| 3333/3333 [00:00<00:00, 7380.75it/s]
100%|██████████| 3333/3333 [00:00<00:00, 8818.76it/s]


# Birdsnap

In [2]:
# Index Birdsnap: the class name is the parent folder, labels are assigned from the
# sorted class names, and everything not listed in test_images.txt is train.
paths = sorted(Path("/data/hanchong/open-source-data/Birdsnap/raw-data/images").rglob("*.jpg"))

# "<class folder>/<file name>" -> "test"; the first line of the file is skipped.
with open("/data/hanchong/open-source-data/Birdsnap/birdsnap/test_images.txt") as f:
    split_mapping = {line.strip(): "test" for line in f.readlines()[1:]}

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [p.name for p in paths],
        "label_name": [p.parent.name for p in paths],
    }
)

# class name -> 1-based position in the sorted list of class names
label_name_mapping = {
    label_name: index + 1 for index, label_name in enumerate(sorted(df["label_name"].unique()))
}

df["label"] = df["label_name"].map(lambda name: label_name_mapping[name])
df["split"] = [
    split_mapping.get(f"{label_name}/{image_name}", "train")
    for label_name, image_name in zip(df["label_name"], df["image_name"])
]

# Move 5 training images per label into the validation split (fixed seed).
val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

39286


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/Birdsnap/raw-d...,534070.jpg,Acadian_Flycatcher,1,val,1.jpg
1,/data/hanchong/open-source-data/Birdsnap/raw-d...,534076.jpg,Acadian_Flycatcher,1,train,2.jpg
2,/data/hanchong/open-source-data/Birdsnap/raw-d...,534079.jpg,Acadian_Flycatcher,1,train,3.jpg
3,/data/hanchong/open-source-data/Birdsnap/raw-d...,534080.jpg,Acadian_Flycatcher,1,train,4.jpg
4,/data/hanchong/open-source-data/Birdsnap/raw-d...,534083.jpg,Acadian_Flycatcher,1,train,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/Birdsnap/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

100%|██████████| 34968/34968 [00:04<00:00, 7681.61it/s]
100%|██████████| 2500/2500 [00:00<00:00, 7189.33it/s]
100%|██████████| 1818/1818 [00:00<00:00, 8817.32it/s]


# NABirds

In [4]:
# Index NABirds: the integer label is the parent folder name, class names come from
# classes.txt and the train/test flag from train_test_split.txt.
paths = sorted(Path("/data/hanchong/open-source-data/NABirds/raw-data/images").rglob("*.*"))

# integer label -> class name (first token is the label, the rest of the line the name)
label_name_mapping = {}
with open("/data/hanchong/open-source-data/NABirds/raw-data/classes.txt") as f:
    for line in f:
        label = line.strip().split(" ", 1)[0]
        label_name_mapping[int(label)] = line[len(label) + 1 :].strip()

# image id with its dashes removed -> split name
split_names = {0: "test", 1: "train"}
split_mapping = {}
with open("/data/hanchong/open-source-data/NABirds/raw-data/train_test_split.txt") as f:
    for line in f:
        image_name, split = line.strip().split(" ")
        split_mapping[image_name.replace("-", "")] = split_names[int(split)]

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [p.name for p in paths],
        "label": [int(p.parent.name) for p in paths],
    }
)
df["label_name"] = df["label"].map(lambda label: label_name_mapping[label])
# The lookup key is the file name up to its first ".".
df["split"] = [split_mapping[name.split(".")[0]] for name in df["image_name"]]


def _sample_val(group):
    """Sample the validation rows of one label: 2 if it has fewer than 15 training images, else 5."""
    return group.sample(n=2 if len(group) < 15 else 5, random_state=42)


val_df = df[df["split"] == "train"].groupby("label").apply(_sample_val, include_groups=False)
# The result is indexed by (label, original row index); level 1 addresses rows of df.
df.loc[val_df.index.get_level_values(1), "split"] = "val"

# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

48562


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/NABirds/raw-da...,01f53d6bf5e449438d2bb79e0854bca4.jpg,295,Common Eider (Adult male),val,1.jpg
1,/data/hanchong/open-source-data/NABirds/raw-da...,074a068d75404dfc9e37bffc8b37265e.jpg,295,Common Eider (Adult male),test,2.jpg
2,/data/hanchong/open-source-data/NABirds/raw-da...,0daddfcbc9a54170ac06402bffeff37c.jpg,295,Common Eider (Adult male),train,3.jpg
3,/data/hanchong/open-source-data/NABirds/raw-da...,19371d9dd2874202b9c7948a5543ed2e.jpg,295,Common Eider (Adult male),test,4.jpg
4,/data/hanchong/open-source-data/NABirds/raw-da...,1fddd7c3b1b242eba5c020aaad4fb429.jpg,295,Common Eider (Adult male),train,5.jpg


In [6]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/NABirds/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

100%|██████████| 21202/21202 [00:02<00:00, 8892.52it/s]
100%|██████████| 2727/2727 [00:00<00:00, 8612.48it/s]
100%|██████████| 24633/24633 [00:02<00:00, 10293.98it/s]


# iNat2017

In [2]:
# Index iNat2017: train/val membership comes from the two annotation JSON files,
# labels from the category list; test images are appended with placeholder label -1.
paths = sorted(Path("/data/hanchong/open-source-data/iNat2017/raw-data/train_val_images").rglob("*.jpg"))

with open("/data/hanchong/open-source-data/iNat2017/raw-data/train2017.json") as f:
    train2017 = json.load(f)
with open("/data/hanchong/open-source-data/iNat2017/raw-data/val2017.json") as f:
    val2017 = json.load(f)

# file name (without its folders) -> split name; val entries are written after train entries.
split_mapping = {}
for annotations, split_name in ((train2017, "train"), (val2017, "val")):
    for image in annotations["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = split_name

# category name -> integer category id, taken from the train annotation file
label_name_mapping = {category["name"]: int(category["id"]) for category in train2017["categories"]}

# The parent folder name of each image is used as its category name.
category_dirs = [p.parent.name for p in paths]
df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [p.name for p in paths],
        "label": [label_name_mapping[category_dir] for category_dir in category_dirs],
        "label_name": category_dirs,
    }
)
df["split"] = [split_mapping[name] for name in df["image_name"]]

test_paths = sorted(Path("/data/hanchong/open-source-data/iNat2017/raw-data/test2017").rglob("*.jpg"))

# Test rows get label -1 and no label_name column (it becomes NaN after the concat).
test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [p.name for p in test_paths],
        "label": -1,
    }
)
test_df["split"] = "test"

df = pd.concat([df, test_df], ignore_index=True)
# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = [f"{position + 1}.jpg" for position in df.index]

print(len(df))
df.head()

857877


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/iNat2017/raw-d...,0bb15d607734ee8ed27d8f45e88bf426.jpg,4745,Abudefduf saxatilis,val,1.jpg
1,/data/hanchong/open-source-data/iNat2017/raw-d...,10d4c817f42724f907bdf5f640d4d472.jpg,4745,Abudefduf saxatilis,val,2.jpg
2,/data/hanchong/open-source-data/iNat2017/raw-d...,19892e351c95f9ee4e25e8667fc3f7e9.jpg,4745,Abudefduf saxatilis,train,3.jpg
3,/data/hanchong/open-source-data/iNat2017/raw-d...,1faf0ba615708a021d080c5a8898dd8e.jpg,4745,Abudefduf saxatilis,train,4.jpg
4,/data/hanchong/open-source-data/iNat2017/raw-d...,206dae61fd527c7d47bf545eb1bddb36.jpg,4745,Abudefduf saxatilis,train,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/iNat2017/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

# Test rows carry the placeholder label -1 and no class name; keep that pseudo-class
# out of labels.txt so the file lists real categories only.
df.loc[df["label"] >= 0, ["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

100%|██████████| 579184/579184 [01:07<00:00, 8625.24it/s]
100%|██████████| 95986/95986 [00:11<00:00, 8719.89it/s]
100%|██████████| 182707/182707 [00:18<00:00, 9918.66it/s] 


# iNat2019

In [2]:
# Index iNat2019: the integer label is the parent folder name, class names come from
# categories.json, train/val membership from the two annotation JSON files; test
# images are appended with placeholder label -1.
paths = Path("/data/hanchong/open-source-data/iNat2019/raw-data/train_val2019").rglob("*.jpg")
paths = sorted(paths)

with open("/data/hanchong/open-source-data/iNat2019/raw-data/categories.json") as f:
    categories = json.load(f)

# Look class names up by category id rather than by list position, so the result does
# not depend on the ordering of categories.json. Entries without an "id" field fall
# back to their position, which is what plain list indexing would have used.
category_name_by_id = {
    int(category.get("id", position)): category["name"] for position, category in enumerate(categories)
}

# file name (without its folders) -> split name; val entries are written after train entries.
split_mapping = {}
with open("/data/hanchong/open-source-data/iNat2019/raw-data/train2019.json") as f:
    train2019 = json.load(f)
    for image in train2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "train"

with open("/data/hanchong/open-source-data/iNat2019/raw-data/val2019.json") as f:
    val2019 = json.load(f)
    for image in val2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "val"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name) for path in paths],
    }
)
df["label_name"] = df["label"].apply(lambda x: category_name_by_id[x])
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

test_paths = Path("/data/hanchong/open-source-data/iNat2019/raw-data/test2019").rglob("*.jpg")
test_paths = sorted(test_paths)

# Test rows get label -1 and no label_name column (it becomes NaN after the concat).
test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
        "label": -1,
    }
)
test_df["split"] = "test"

df = pd.concat([df, test_df], ignore_index=True)
# Sequential 1-based file names used for the processed symlinks.
df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

303593


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/iNat2019/raw-d...,0042d05b4ffbd5a1ce2fc56513a7777e.jpg,153,Lithobates sphenocephalus,train,1.jpg
1,/data/hanchong/open-source-data/iNat2019/raw-d...,006f69e838b87cfff3d12120795c4ada.jpg,153,Lithobates sphenocephalus,train,2.jpg
2,/data/hanchong/open-source-data/iNat2019/raw-d...,00c1bf968b20839ead054b3ab9eb1ce2.jpg,153,Lithobates sphenocephalus,train,3.jpg
3,/data/hanchong/open-source-data/iNat2019/raw-d...,011ae401924d635371dc70e059b9748b.jpg,153,Lithobates sphenocephalus,train,4.jpg
4,/data/hanchong/open-source-data/iNat2019/raw-d...,013862c72d6dc1344892e96af1130d76.jpg,153,Lithobates sphenocephalus,train,5.jpg


In [3]:
# Write the processed layout: label/split summaries, one CSV per split, and a symlink
# tree (<split>/<label>/<n>.jpg for train and val, flat <split>/<n>.jpg for test).
output_folder = Path("/data/hanchong/open-source-data/iNat2019/processed-data")
output_folder.mkdir(parents=True, exist_ok=True)

# Test rows carry the placeholder label -1 and no class name; keep that pseudo-class
# out of labels.txt so the file lists real categories only.
df.loc[df["label"] >= 0, ["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    if split != "test":
        # Create each class folder once up front instead of once per image.
        for label in _df["label"].unique():
            (split_output_folder / str(label)).mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            # Test images are linked flat, without a class sub-folder.
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]

        if output_path.exists():
            # A usable entry is already in place (e.g. from a previous run): keep it.
            continue
        if output_path.is_symlink():
            # exists() follows links, so a link that reaches this point is dangling.
            # Remove it, otherwise symlink_to() below raises FileExistsError.
            output_path.unlink()

        output_path.symlink_to(row["image_path"])

100%|██████████| 265213/265213 [00:30<00:00, 8566.46it/s]
100%|██████████| 3030/3030 [00:00<00:00, 8138.10it/s]
100%|██████████| 35350/35350 [00:03<00:00, 10034.66it/s]
