In [1]:
import json
import scipy.io
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Primary Dataset

## Birdsnap

In [4]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Birdsnap/")

paths = sorted((folder / "raw-data" / "images").rglob("*.*"))
print(len(paths))

split_mapping = {}
with open(folder / "birdsnap" / "test_images.txt") as f:
    for line in f.readlines()[1:]:
        image_name = line.strip()
        split_mapping[image_name] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])
df["split"] = df.apply(lambda x: split_mapping.get(f"{x['label_name']}/{x['image_name']}", "train"), axis=1)

val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

39332
39332


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,534070.jpg,Acadian_Flycatcher,1,val,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,534076.jpg,Acadian_Flycatcher,1,train,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,534079.jpg,Acadian_Flycatcher,1,train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,534080.jpg,Acadian_Flycatcher,1,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,534083.jpg,Acadian_Flycatcher,1,train,5.jpg


In [5]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 35011/35011 [00:03<00:00, 9018.24it/s]
100%|██████████| 2500/2500 [00:00<00:00, 8921.07it/s]
100%|██████████| 1821/1821 [00:00<00:00, 10528.23it/s]


## Country211

In [6]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Country211")

paths = sorted((folder / "raw-data" / "Images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
        "split": [path.parent.parent.name for path in paths],
    }
)
df["split"] = df["split"].apply(lambda x: "val" if x == "valid" else x)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

63300
63300


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,1056316_42.461143_1.501972.jpg,AD,test,1,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,1140872_42.520573_1.684684.jpg,AD,test,1,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,1306731_42.506203_1.527056.jpg,AD,test,1,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,1320521_42.455507_1.462726.jpg,AD,test,1,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,1375030_42.539943_1.720669.jpg,AD,test,1,5.jpg


In [8]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 31650/31650 [00:03<00:00, 8962.01it/s]
100%|██████████| 10550/10550 [00:01<00:00, 8970.23it/s]
100%|██████████| 21100/21100 [00:02<00:00, 10477.34it/s]


## CUB-200-2011

In [None]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/CUB-200-2011")

paths = sorted((folder / "raw-data" / "images").rglob("*.*/*.*"))
print(len(paths))

index_split_df = pd.read_csv(folder / "raw-data" / "train_test_split.txt", sep=" ", header=None)
index_split_df.columns = ["Index", "Split"]

index_image_df = pd.read_csv(folder / "raw-data" / "images.txt", sep=" ", header=None)
index_image_df.columns = ["Index", "Image"]

split_df = pd.merge(index_split_df, index_image_df, on="Index")
split_mapping = {row["Image"].split("/")[-1]: {0: "test", 1: "train"}[row["Split"]] for _, row in split_df.iterrows()}

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name.split(".")[0]) for path in paths],
        "label_name": [path.parent.name.split(".")[-1] for path in paths],
    }
)
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

val_df = df[df["split"] == "train"].groupby("label").sample(n=2, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

11788
11788


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,Black_Footed_Albatross_0001_796111.jpg,1,Black_footed_Albatross,test,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,Black_Footed_Albatross_0002_55.jpg,1,Black_footed_Albatross,test,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,Black_Footed_Albatross_0003_796136.jpg,1,Black_footed_Albatross,test,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,Black_Footed_Albatross_0005_796090.jpg,1,Black_footed_Albatross,test,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,Black_Footed_Albatross_0006_796065.jpg,1,Black_footed_Albatross,test,5.jpg


In [17]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/5594 [00:00<?, ?it/s]

100%|██████████| 5594/5594 [00:00<00:00, 9015.41it/s]
100%|██████████| 400/400 [00:00<00:00, 8633.99it/s]
100%|██████████| 5794/5794 [00:00<00:00, 10162.35it/s]


## FGVC Aircraft

In [19]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/FGVC_Aircraft")

paths = sorted((folder / "raw-data" / "data" / "images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)

label_name_mapping = {}
with open(folder / "raw-data" / "data" / "variants.txt") as f:
    for index, line in enumerate(f.readlines()):
        label_name = line.strip()
        label_name_mapping[label_name] = index + 1

data_list = []
for path, split in [
    (folder / "raw-data" / "data" / "images_variant_train.txt", "train"),
    (folder / "raw-data" / "data" / "images_variant_val.txt", "val"),
    (folder / "raw-data" / "data" / "images_variant_test.txt", "test"),
]:
    with open(path) as f:
        for line in f.readlines():
            image_name = line.strip().split(" ", 1)[0]
            data_list.append(
                {
                    "image_name": image_name + ".jpg",
                    "label": label_name_mapping[line[len(image_name) + 1 :].strip()],
                    "label_name": line[len(image_name) + 1 :].strip(),
                    "split": split,
                }
            )

df = pd.merge(df, pd.DataFrame(data_list), on="image_name")

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

10000
10000


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,0034309.jpg,56,DC-8,val,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,0034958.jpg,3,737-200,val,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,0037511.jpg,57,DC-9-30,val,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,0037512.jpg,3,737-200,test,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,0038598.jpg,86,MD-11,train,5.jpg


In [20]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/3334 [00:00<?, ?it/s]

100%|██████████| 3334/3334 [00:00<00:00, 8808.10it/s]
100%|██████████| 3333/3333 [00:00<00:00, 8868.60it/s]
100%|██████████| 3333/3333 [00:00<00:00, 10416.36it/s]


## Food-101

In [28]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Food-101")

paths = sorted((folder / "raw-data" / "images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)

split_mapping = {}
with open(folder / "raw-data" / "meta" / "test.txt") as f:
    for line in f.readlines():
        image_name = line.strip()
        split_mapping[image_name] = "test"

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])
df["split"] = df.apply(lambda x: split_mapping.get(f"{x['label_name']}/{x['image_name'].split('.')[0]}", "train"), axis=1)

val_df = df[df["split"] == "train"].groupby("label").sample(n=10, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

101000
101000


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,1005649.jpg,apple_pie,1,train,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,1011328.jpg,apple_pie,1,test,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,101251.jpg,apple_pie,1,test,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,1014775.jpg,apple_pie,1,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,1026328.jpg,apple_pie,1,train,5.jpg


In [29]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 74740/74740 [00:08<00:00, 8696.19it/s]
100%|██████████| 1010/1010 [00:00<00:00, 8697.82it/s]
100%|██████████| 25250/25250 [00:02<00:00, 9184.37it/s]


## NABirds

In [31]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/NABirds")

paths = sorted((folder / "raw-data" / "images").rglob("*.*"))
print(len(paths))

label_name_mapping = {}
with open(folder / "raw-data" / "classes.txt") as f:
    for line in f.readlines():
        label = line.strip().split(" ", 1)[0]
        label_name = line[len(label) + 1 :].strip()
        label_name_mapping[int(label)] = label_name

split_mapping = {}
with open(folder / "raw-data" / "train_test_split.txt") as f:
    for line in f.readlines():
        image_name, split = line.strip().split(" ")
        split_mapping["".join(image_name.split("-"))] = {0: "test", 1: "train"}[int(split)]

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name) for path in paths],
    }
)
df["label_name"] = df["label"].apply(lambda x: label_name_mapping[x])
df["split"] = df["image_name"].apply(lambda x: split_mapping[x.split(".")[0]])

val_df = df[df["split"] == "train"].groupby("label").apply(lambda x: x.sample(n=2 if len(x) < 15 else 5, random_state=42), include_groups=False)
df.loc[val_df.index.get_level_values(1), "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

48562
48562


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,01f53d6bf5e449438d2bb79e0854bca4.jpg,295,Common Eider (Adult male),val,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,074a068d75404dfc9e37bffc8b37265e.jpg,295,Common Eider (Adult male),test,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,0daddfcbc9a54170ac06402bffeff37c.jpg,295,Common Eider (Adult male),train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,19371d9dd2874202b9c7948a5543ed2e.jpg,295,Common Eider (Adult male),test,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,1fddd7c3b1b242eba5c020aaad4fb429.jpg,295,Common Eider (Adult male),train,5.jpg


In [32]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 21202/21202 [00:02<00:00, 8608.51it/s]
100%|██████████| 2727/2727 [00:00<00:00, 8340.88it/s]
100%|██████████| 24633/24633 [00:02<00:00, 9790.49it/s] 


## Oxford Flowers

In [34]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Oxford_Flowers")

paths = sorted((folder / "raw-data" / "jpg").rglob("*.*"))
print(len(paths))

imagelabels = scipy.io.loadmat(folder / "raw-data" / "imagelabels.mat")
setid = scipy.io.loadmat(folder / "raw-data" / "setid.mat")

label_name_df = pd.read_csv(folder / "raw-data" / "oxford_flower_102_name.csv")
label_name_mapping = {int(row["Index"]) + 1: row["Name"] for _, row in label_name_df.iterrows()}

split_mapping = {}
for i in setid["trnid"].flatten().tolist():
    split_mapping[i] = "train"
for i in setid["valid"].flatten().tolist():
    split_mapping[i] = "val"
for i in setid["tstid"].flatten().tolist():
    split_mapping[i] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)
df["label"] = imagelabels["labels"].flatten()
df["label_name"] = df["label"].apply(lambda x: label_name_mapping[x])
df["split"] = df["image_name"].apply(lambda x: split_mapping[int(x.split("_")[-1].split(".")[0])])

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

8189
8189


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,image_00001.jpg,77,passion flower,test,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,image_00002.jpg,77,passion flower,test,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,image_00003.jpg,77,passion flower,test,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,image_00004.jpg,77,passion flower,test,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,image_00005.jpg,77,passion flower,test,5.jpg


In [35]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/1020 [00:00<?, ?it/s]

100%|██████████| 1020/1020 [00:00<00:00, 9009.07it/s]
100%|██████████| 1020/1020 [00:00<00:00, 8825.78it/s]
100%|██████████| 6149/6149 [00:00<00:00, 9957.09it/s] 


## Oxford Pets

In [36]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Oxford_Pets")

paths = sorted((folder / "raw-data" / "images").rglob("*.*"))
print(len(paths))

split_mapping = {}
with open(folder / "raw-data" / "annotations" / "test.txt") as f:
    for line in f.readlines():
        image_name = line.strip().split(" ")[0]
        split_mapping[image_name] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": ["_".join(path.name.split("_")[:-1]) for path in paths],
    }
)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])
df["split"] = df.apply(lambda x: split_mapping.get(f"{x['image_name'].split('.')[0]}", "train"), axis=1)

val_df = df[df["split"] == "train"].groupby("label").sample(n=10, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

7393
7393


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,Abyssinian_1.jpg,Abyssinian,1,val,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,Abyssinian_10.jpg,Abyssinian,1,train,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,Abyssinian_100.jpg,Abyssinian,1,train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,Abyssinian_100.mat,Abyssinian,1,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,Abyssinian_101.jpg,Abyssinian,1,train,5.jpg


In [37]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/3354 [00:00<?, ?it/s]

100%|██████████| 3354/3354 [00:00<00:00, 8478.16it/s]
100%|██████████| 370/370 [00:00<00:00, 8807.41it/s]
100%|██████████| 3669/3669 [00:00<00:00, 10323.87it/s]


## RESISC45

In [39]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/RESISC45")

paths = sorted((folder / "raw-data" / "Images").rglob("*.jpg"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

df["split"] = "train"
test_df = df[df["split"] == "train"].groupby("label").sample(n=50, random_state=42)
df.loc[test_df.index, "split"] = "test"
val_df = df[df["split"] == "train"].groupby("label").sample(n=50, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

31500
31500


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,airplane_001.jpg,airplane,1,train,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,airplane_002.jpg,airplane,1,train,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,airplane_003.jpg,airplane,1,val,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,airplane_004.jpg,airplane,1,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,airplane_005.jpg,airplane,1,train,5.jpg


In [40]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 27000/27000 [00:03<00:00, 7849.68it/s]
100%|██████████| 2250/2250 [00:00<00:00, 8736.26it/s]
100%|██████████| 2250/2250 [00:00<00:00, 10092.35it/s]


## Standford Cars

In [48]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Stanford_Cars")

train_paths = sorted((folder / "raw-data" / "cars_train").rglob("*.*"))
print(len(train_paths))

test_paths = sorted((folder / "raw-data" / "cars_test").rglob("*.*"))
print(len(test_paths))

cars_meta = scipy.io.loadmat(folder / "raw-data" / "devkit" / "cars_meta.mat")
class_names = cars_meta["class_names"].flatten().tolist()
class_names = [name[0] for name in class_names]

cars_train_annos = scipy.io.loadmat(folder / "raw-data" / "devkit" / "cars_train_annos.mat")

train_df = pd.DataFrame(
    {
        "image_name": [f[-1][0] for f in cars_train_annos["annotations"].flatten().tolist()],
        "label": [f[-2][0][0] for f in cars_train_annos["annotations"].flatten().tolist()],
    }
)
train_df["label_name"] = train_df["label"].apply(lambda x: class_names[x - 1])
train_df["split"] = "train"

cars_test_annos = scipy.io.loadmat(folder / "raw-data" / "cars_test_annos_withlabels.mat")

test_df = pd.DataFrame(
    {
        "image_name": [f[-1][0] for f in cars_test_annos["annotations"].flatten().tolist()],
        "label": [f[-2][0][0] for f in cars_test_annos["annotations"].flatten().tolist()],
    }
)
test_df["label_name"] = test_df["label"].apply(lambda x: class_names[x - 1])
test_df["split"] = "test"

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
).merge(train_df, on="image_name")

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
).merge(test_df, on="image_name")

df = pd.concat([train_df, test_df], ignore_index=True)

val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

8144
8041
16185


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,00001.jpg,14,Audi TTS Coupe 2012,train,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,00002.jpg,3,Acura TL Sedan 2012,train,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,00003.jpg,91,Dodge Dakota Club Cab 2007,train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,00004.jpg,134,Hyundai Sonata Hybrid Sedan 2012,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,00005.jpg,106,Ford F-450 Super Duty Crew Cab 2012,train,5.jpg


In [49]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 7164/7164 [00:00<00:00, 8545.36it/s]
100%|██████████| 980/980 [00:00<00:00, 8730.38it/s]
100%|██████████| 8041/8041 [00:00<00:00, 10147.05it/s]


## Stanford Dogs

In [None]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/Stanford_Dogs")

paths = sorted((folder / "raw-data" / "Images").rglob("*.*"))
print(len(paths))

train_list = scipy.io.loadmat(folder / "raw-data" / "train_list.mat")
test_list = scipy.io.loadmat(folder / "raw-data" / "test_list.mat")

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)

split_df = pd.concat(
    [
        pd.DataFrame(
            {
                "image_path": [folder / "raw-data" / "Images" / f[0] for f in train_list["file_list"].flatten().tolist()],
                "label": train_list["labels"].flatten().tolist(),
                "label_name": [f[0].split("/")[0] for f in train_list["file_list"].flatten().tolist()],
                "split": "train",
            }
        ),
        pd.DataFrame(
            {
                "image_path": [folder / "raw-data" / "Images" / f[0] for f in test_list["file_list"].flatten().tolist()],
                "label": test_list["labels"].flatten().tolist(),
                "label_name": [f[0].split("/")[0] for f in test_list["file_list"].flatten().tolist()],
                "split": "test",
            }
        ),
    ],
    ignore_index=True,
)

df = pd.merge(df, split_df, on="image_path")

val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

20580
20580


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,n02085620_10074.jpg,1,n02085620-Chihuahua,test,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,n02085620_10131.jpg,1,n02085620-Chihuahua,test,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,n02085620_10621.jpg,1,n02085620-Chihuahua,train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,n02085620_1073.jpg,1,n02085620-Chihuahua,test,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,n02085620_10976.jpg,1,n02085620-Chihuahua,train,5.jpg


In [45]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/11400 [00:00<?, ?it/s]

100%|██████████| 11400/11400 [00:01<00:00, 8625.35it/s]
100%|██████████| 600/600 [00:00<00:00, 8832.28it/s]
100%|██████████| 8580/8580 [00:00<00:00, 10148.10it/s]


## SUN397

In [56]:
folder = Path("/data/hanchong/open-source-data/PrimaryDatasets/SUN397")

paths = sorted((folder / "raw-data" / "Images").rglob("*.jpg"))
print(len(paths))

split_mapping = {}
with open(folder / "raw-data" / "Testing_01.txt") as f:
    for line in f.readlines():
        image_name = line.strip().split("/")[-1]
        split_mapping[image_name] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])
df["split"] = df.apply(lambda x: split_mapping.get(f"{x['image_name']}", "train"), axis=1)

val_df = df[df["split"] == "train"].groupby("label").sample(n=5, random_state=42)
df.loc[val_df.index, "split"] = "val"

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

108754
108754


Unnamed: 0,image_path,image_name,label_name,label,split,image_name_new
0,/data/hanchong/open-source-data/PrimaryDataset...,sun_aaalbzqrimafwbiv.jpg,abbey,1,test,1.jpg
1,/data/hanchong/open-source-data/PrimaryDataset...,sun_aaaulhwrhqgejnyt.jpg,abbey,1,train,2.jpg
2,/data/hanchong/open-source-data/PrimaryDataset...,sun_aacphuqehdodwawg.jpg,abbey,1,train,3.jpg
3,/data/hanchong/open-source-data/PrimaryDataset...,sun_aacyknxirsfolpon.jpg,abbey,1,train,4.jpg
4,/data/hanchong/open-source-data/PrimaryDataset...,sun_aadqayzjxpvmblix.jpg,abbey,1,train,5.jpg


In [58]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 87379/87379 [00:10<00:00, 8470.41it/s]
100%|██████████| 1525/1525 [00:00<00:00, 8408.82it/s]
100%|██████████| 19850/19850 [00:01<00:00, 10095.18it/s]


# Secondary Dataset

## iNaturalist 2017

In [2]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/iNaturalist_2017")

paths = sorted((folder / "raw-data" / "train_val_images").rglob("*.*"))
print(len(paths))

split_mapping = {}
with open(folder / "raw-data" / "train2017.json") as f:
    train2019 = json.load(f)
    for image in train2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "train"

with open(folder / "raw-data" / "val2017.json") as f:
    val2019 = json.load(f)
    for image in val2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "val"

label_name_mapping = {}
for category in train2019["categories"]:
    label_name_mapping[category["name"]] = int(category["id"])

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [label_name_mapping[path.parent.name] for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

test_paths = sorted((folder / "raw-data" / "test2017").rglob("*.*"))
print(len(test_paths))

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
        "label": -1,
        "label_name": "unknown",
    }
)
test_df["split"] = "test"

df = pd.concat([df, test_df], ignore_index=True)
df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

675170
182707
857877


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,0bb15d607734ee8ed27d8f45e88bf426.jpg,4745,Abudefduf saxatilis,val,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,10d4c817f42724f907bdf5f640d4d472.jpg,4745,Abudefduf saxatilis,val,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,19892e351c95f9ee4e25e8667fc3f7e9.jpg,4745,Abudefduf saxatilis,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,1faf0ba615708a021d080c5a8898dd8e.jpg,4745,Abudefduf saxatilis,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,206dae61fd527c7d47bf545eb1bddb36.jpg,4745,Abudefduf saxatilis,train,5.jpg


In [3]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 579184/579184 [01:07<00:00, 8622.95it/s]
100%|██████████| 95986/95986 [00:11<00:00, 8703.58it/s]
100%|██████████| 182707/182707 [00:18<00:00, 10011.06it/s]


## iNaturalist 2019

In [2]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/iNaturalist_2019")

paths = sorted((folder / "raw-data" / "train_val2019").rglob("*.*"))
print(len(paths))

with open(folder / "raw-data" / "categories.json") as f:
    categories = json.load(f)

split_mapping = {}
with open(folder / "raw-data" / "train2019.json") as f:
    train2019 = json.load(f)
    for image in train2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "train"

with open(folder / "raw-data" / "val2019.json") as f:
    val2019 = json.load(f)
    for image in val2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "val"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name) for path in paths],
    }
)
df["label_name"] = df["label"].apply(lambda x: categories[x]["name"])
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

test_paths = sorted((folder / "raw-data" / "test2019").rglob("*.*"))
print(len(test_paths))

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
        "label": -1,
        "label_name": "unknown",
    }
)
test_df["split"] = "test"

df = pd.concat([df, test_df], ignore_index=True)
df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

268243
35350
303593


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,0042d05b4ffbd5a1ce2fc56513a7777e.jpg,153,Lithobates sphenocephalus,train,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,006f69e838b87cfff3d12120795c4ada.jpg,153,Lithobates sphenocephalus,train,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,00c1bf968b20839ead054b3ab9eb1ce2.jpg,153,Lithobates sphenocephalus,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,011ae401924d635371dc70e059b9748b.jpg,153,Lithobates sphenocephalus,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,013862c72d6dc1344892e96af1130d76.jpg,153,Lithobates sphenocephalus,train,5.jpg


In [5]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 265213/265213 [00:29<00:00, 8887.93it/s]
100%|██████████| 3030/3030 [00:00<00:00, 8378.43it/s]
100%|██████████| 35350/35350 [00:03<00:00, 10294.20it/s]


## Butterflies and Moths 2019

In [2]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Butterflies_and_Moths_2019")

train_paths = sorted((folder / "raw-data" / "train").rglob("*.*"))
print(len(train_paths))
test_paths = sorted((folder / "raw-data" / "test").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
)
with open(folder / "raw-data" / "train_annotations.json") as f:
    metadata = json.load(f)
metadata_annotations = pd.DataFrame(metadata["annotations"])
metadata_images = pd.DataFrame(metadata["images"])
metadata_images.rename(columns={"id": "image_id"}, inplace=True)
metadata_categories = pd.DataFrame(metadata["categories"])
metadata_categories.rename(columns={"id": "category_id"}, inplace=True)

label = pd.merge(metadata_images[["file_name", "image_id"]], metadata_annotations[["image_id", "category_id"]], on="image_id")
label = pd.merge(label, metadata_categories[["category_id", "name"]], on="category_id", how="left")
label.rename(columns={"file_name": "image_name", "category_id": "label", "name": "label_name"}, inplace=True)
label.drop(columns=["image_id"], inplace=True)

train_df = pd.merge(train_df, label, on="image_name")
train_df["split"] = "train"

val_df = train_df[train_df["split"] == "train"].groupby("label").sample(n=2, random_state=42)
train_df.loc[val_df.index, "split"] = "val"

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

473438
59141
532579


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,0000031e2e9701e24d046c0dc3889bde.jpg,578,1aba6e966c53569690e4ab13431bd21b,train,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,00001246ce7f88ec292384abcc58d3e0.jpg,438,146b901c00383faa801c11a6e62f96a7,train,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,00006fd6c3cd2e525e2df491e157844d.jpg,3990,bb7feba170763050772fbcd2b2169756,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,0000a8ef4598a73544df0d42ca754a73.jpg,2964,894a43cd9ecb01a8c4f3e45255b34597,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,0000d2de78907e422f61ec502819e27b.jpg,1757,4f691417f90a25e616ec62c2a15baa84,train,5.jpg


In [3]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 462604/462604 [00:53<00:00, 8619.13it/s]
100%|██████████| 10834/10834 [00:01<00:00, 8298.54it/s]
100%|██████████| 59141/59141 [00:05<00:00, 10604.29it/s]


## Fruits and Vegetables

In [70]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Fruits_and_Vegetables")

paths = sorted((folder / "raw-data").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
        "split": [path.parent.parent.name for path in paths],
    }
)
df["split"] = df["split"].apply(lambda x: "val" if x == "validation" else x)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

3825
3825


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,Image_1.jpg,apple,test,0,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,Image_10.jpg,apple,test,0,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,Image_2.jpg,apple,test,0,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,Image_3.jpg,apple,test,0,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,Image_4.jpg,apple,test,0,5.jpg


In [71]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/3115 [00:00<?, ?it/s]

100%|██████████| 3115/3115 [00:00<00:00, 8982.63it/s]
100%|██████████| 351/351 [00:00<00:00, 8313.90it/s]
100%|██████████| 359/359 [00:00<00:00, 10584.68it/s]


## Fungi CLEF 2024

In [7]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Fungi_CLEF_2024")

train_paths = sorted((folder / "raw-data" / "DF20_300").rglob("*.*"))
print(len(train_paths))

val_paths = sorted((folder / "raw-data" / "DF21_300").rglob("*.*"))
print(len(val_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
)
train_df["split"] = "train"
train_label = pd.read_csv(folder / "raw-data" / "FungiCLEF2023_train_metadata_PRODUCTION.csv", usecols=["image_path", "class_id", "species"])
train_label.rename(columns={"image_path": "image_name", "class_id": "label", "species": "label_name"}, inplace=True)
train_label["image_name"] = train_label["image_name"].apply(lambda x: x[:-4] + ".jpg")
train_df = pd.merge(train_df, train_label, on="image_name")

val_df = pd.DataFrame(
    {
        "image_path": val_paths,
        "image_name": [path.name for path in val_paths],
    }
)
val_df["split"] = "val"
val_label = pd.read_csv(folder / "raw-data" / "FungiCLEF2023_val_metadata_PRODUCTION.csv", usecols=["image_path", "class_id", "species"])
val_label.rename(columns={"image_path": "image_name", "class_id": "label", "species": "label_name"}, inplace=True)
val_df = pd.merge(val_df, val_label, on="image_name")

val_df = val_df[val_df["label"] != -1]

df = pd.concat([train_df, val_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

295938
121057
334816


Unnamed: 0,image_path,image_name,split,label_name,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,2237851949-74654.jpg,train,Circinaria caesiocinerea,177,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,2237851951-222637.jpg,train,Polysporina simplex,1208,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,2237851957-297864.jpg,train,Collema tenax,419,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,2237851957-297865.jpg,train,Collema tenax,419,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,2237851963-0.jpg,train,Cladonia rangiformis,200,5.jpg


In [8]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 295938/295938 [00:34<00:00, 8646.60it/s]
100%|██████████| 38878/38878 [00:04<00:00, 8828.47it/s]


## Fungi CLEF 2025

In [9]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Fungi_CLEF_2025")

images_folder = folder / "raw-data" / "images" / "FungiTastic-FewShot"
metadata_folder = folder / "raw-data" / "metadata" / "FungiTastic-FewShot"

train_paths = sorted((images_folder / "train" / "300p").rglob("*.*"))
print(len(train_paths))
val_paths = sorted((images_folder / "val" / "300p").rglob("*.*"))
print(len(val_paths))
test_paths = sorted((images_folder / "test" / "300p").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
)
train_df["split"] = "train"
train_label = pd.read_csv(metadata_folder / "FungiTastic-FewShot-Train.csv", usecols=["filename", "category_id", "species"])
train_label.rename(columns={"filename": "image_name", "category_id": "label", "species": "label_name"}, inplace=True)
train_df = pd.merge(train_df, train_label, on="image_name")

val_df = pd.DataFrame(
    {
        "image_path": val_paths,
        "image_name": [path.name for path in val_paths],
    }
)
val_df["split"] = "val"
val_label = pd.read_csv(metadata_folder / "FungiTastic-FewShot-Val.csv", usecols=["filename", "category_id", "species"])
val_label.rename(columns={"filename": "image_name", "category_id": "label", "species": "label_name"}, inplace=True)
val_df = pd.merge(val_df, val_label, on="image_name")

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, val_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

7819
2285
1911
12015


Unnamed: 0,image_path,image_name,split,label_name,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,0-2237852042.JPG,train,Gloeocystidiellum clavuligerum,916,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,0-2237852122.JPG,train,Plicaria endocarpoides,1749,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,0-2237852952.JPG,train,Dolichousnea longissima,2375,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,0-2237853300.JPG,train,Cyathicula amenti,621,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,0-2237853303.JPG,train,Rutstroemia elatina,2028,5.jpg


In [10]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

 10%|█         | 805/7819 [00:00<00:00, 8045.63it/s]

100%|██████████| 7819/7819 [00:00<00:00, 8692.12it/s]
100%|██████████| 2285/2285 [00:00<00:00, 8878.22it/s]
100%|██████████| 1911/1911 [00:00<00:00, 10612.47it/s]


## Herbarium 2021

In [11]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Herbarium_2021")

train_paths = sorted((folder / "raw-data" / "train" / "images").rglob("*.*"))
print(len(train_paths))

test_paths = sorted((folder / "raw-data" / "test" / "images").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": [str(path.relative_to(folder / "raw-data" / "train")) for path in train_paths],
        "image_name": [path.name for path in train_paths],
    }
)
with open(folder / "raw-data" / "train" / "metadata.json") as f:
    metadata = json.load(f)
metadata_annotations = pd.DataFrame(metadata["annotations"])
metadata_images = pd.DataFrame(metadata["images"])
metadata_categories = pd.DataFrame(metadata["categories"])

label = pd.merge(metadata_images[["file_name", "id"]], metadata_annotations[["id", "category_id"]], on="id", how="left")
label.drop(columns=["id"], inplace=True)
label = pd.merge(label, metadata_categories[["id", "name"]], left_on="category_id", right_on="id", how="left")
label.rename(columns={"file_name": "image_path", "category_id": "label", "name": "label_name"}, inplace=True)
label.drop(columns=["id"], inplace=True)

train_df = pd.merge(train_df, label, on="image_path")
train_df["image_path"] = folder / "raw-data" / "train" / train_df["image_path"]
train_df["split"] = "train"

val_df = train_df[train_df["split"] == "train"].groupby("label").sample(n=2, random_state=42)
train_df.loc[val_df.index, "split"] = "val"

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

2257759
243020
2500779


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,1360648.jpg,0,Aa calceata (Rchb.f.) Schltr.,val,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,1433074.jpg,0,Aa calceata (Rchb.f.) Schltr.,val,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,1703060.jpg,0,Aa calceata (Rchb.f.) Schltr.,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,1104517.jpg,1,Aa matthewsii (Rchb.f.) Schltr.,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,1486090.jpg,1,Aa matthewsii (Rchb.f.) Schltr.,val,5.jpg


In [12]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 2128759/2128759 [04:00<00:00, 8861.55it/s]
100%|██████████| 129000/129000 [00:15<00:00, 8252.83it/s]
100%|██████████| 243020/243020 [00:23<00:00, 10217.18it/s]


## Herbarium 2022

In [13]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Herbarium_2022")

train_paths = sorted((folder / "raw-data" / "train_images").rglob("*.*"))
print(len(train_paths))

test_paths = sorted((folder / "raw-data" / "test_images").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": [str(path.relative_to(folder / "raw-data" / "train_images")) for path in train_paths],
        "image_name": [path.name for path in train_paths],
    }
)
with open(folder / "raw-data" / "train_metadata.json") as f:
    metadata = json.load(f)
metadata_annotations = pd.DataFrame(metadata["annotations"])
metadata_images = pd.DataFrame(metadata["images"])
metadata_categories = pd.DataFrame(metadata["categories"])

label = pd.merge(metadata_images[["file_name", "image_id"]], metadata_annotations[["image_id", "category_id"]], on="image_id", how="left")
label = pd.merge(label, metadata_categories[["category_id", "species"]], on="category_id", how="left")
label.rename(columns={"file_name": "image_path", "category_id": "label", "species": "label_name"}, inplace=True)
label.drop(columns=["image_id"], inplace=True)

train_df = pd.merge(train_df, label, on="image_path")
train_df["image_path"] = folder / "raw-data" / "train_images" / train_df["image_path"]
train_df["split"] = "train"

val_df = train_df[train_df["split"] == "train"].groupby("label").sample(n=2, random_state=42)
train_df.loc[val_df.index, "split"] = "val"

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

839772
210407
1050179


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,00000__001.jpg,0,amabilis,val,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,00000__002.jpg,0,amabilis,train,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,00000__003.jpg,0,amabilis,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,00000__004.jpg,0,amabilis,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,00000__005.jpg,0,amabilis,train,5.jpg


In [14]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 808770/808770 [01:30<00:00, 8895.42it/s]
100%|██████████| 31002/31002 [00:03<00:00, 8242.33it/s]
100%|██████████| 210407/210407 [00:20<00:00, 10231.87it/s]


## Hotel-ID to Combat Human Trafficking 2021

In [31]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Hotel-ID_2021")

paths = sorted((folder / "raw-data" / "train_images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)
label = pd.read_csv(folder / "raw-data" / "train.csv", usecols=["image", "hotel_id"])
label.drop_duplicates(subset=["image"], inplace=True)
label.rename(columns={"hotel_id": "label_name", "image": "image_name"}, inplace=True)

df = pd.merge(df, label, on="image_name")
df["split"] = "train"

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

val_df = df[df["split"] == "train"].groupby("label_name").apply(lambda x: x.sample(n=1 if len(x) < 4 else 2, random_state=42), include_groups=False)
df.loc[val_df.index.get_level_values(1), "split"] = "val"

test_paths = sorted((folder / "raw-data" / "test_images").rglob("*.*"))
print(len(test_paths))

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

97554
3
97557


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,800a4a6b6f6d2df6.jpg,12036,val,1405,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,8026797d7b65c4b2.jpg,36739,val,4334,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,80286fe316ff68a5.jpg,13494,train,1608,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,8028cf47a37b369d.jpg,35733,train,4200,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,802af4d04faf14df.jpg,3876,train,468,5.jpg


In [32]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 82051/82051 [00:09<00:00, 8767.53it/s]
100%|██████████| 15503/15503 [00:01<00:00, 7844.44it/s]
100%|██████████| 3/3 [00:00<00:00, 2302.45it/s]


## Hotel-ID to Combat Human Trafficking 2022

In [33]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Hotel-ID_2022")

paths = sorted((folder / "raw-data" / "train_images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)

df["split"] = "train"

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

val_df = df[df["split"] == "train"].groupby("label_name").apply(lambda x: x.sample(n=1 if len(x) < 4 else 2, random_state=42), include_groups=False)
df.loc[val_df.index.get_level_values(1), "split"] = "val"

test_paths = sorted((folder / "raw-data" / "test_images").rglob("*.*"))
print(len(test_paths))

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

44702
1
44703


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,000003766.jpg,100055,train,0,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,000003767.jpg,100055,val,0,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,000003768.jpg,100055,train,0,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,000003769.jpg,100055,train,0,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,000003770.jpg,100055,val,0,5.jpg


In [34]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 38530/38530 [00:04<00:00, 8924.42it/s]
100%|██████████| 6172/6172 [00:00<00:00, 8200.28it/s]
100%|██████████| 1/1 [00:00<00:00, 1916.08it/s]


## iCassava 2019

In [19]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/iCassava_2019")

train_paths = sorted((folder / "raw-data" / "train").rglob("*.*"))
print(len(train_paths))

test_paths = sorted((folder / "raw-data" / "test").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
        "label_name": [path.parent.name for path in train_paths],
    }
)
train_df["split"] = "train"

label_name_mapping = {}
for index, label_name in enumerate(sorted(train_df["label_name"].unique())):
    label_name_mapping[label_name] = index
train_df["label"] = train_df["label_name"].apply(lambda x: label_name_mapping[x])

val_df = train_df[train_df["split"] == "train"].groupby("label_name").sample(n=20, random_state=42)
train_df.loc[val_df.index, "split"] = "val"

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

5656
3774
9430


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,train-cbb-0.jpg,cbb,train,0,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,train-cbb-1.jpg,cbb,train,0,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,train-cbb-10.jpg,cbb,train,0,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,train-cbb-100.jpg,cbb,train,0,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,train-cbb-101.jpg,cbb,train,0,5.jpg


In [20]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/5556 [00:00<?, ?it/s]

100%|██████████| 5556/5556 [00:00<00:00, 8222.15it/s]
100%|██████████| 100/100 [00:00<00:00, 8733.22it/s]
100%|██████████| 3774/3774 [00:00<00:00, 9880.84it/s]


## iDesigner 2019

In [21]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/iDesigner_2019")

train_paths = sorted((folder / "raw-data" / "designer_image_train_v2_cropped").rglob("*.*"))
print(len(train_paths))

test_paths = sorted((folder / "raw-data" / "designer_image_test").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
        "label_name": [path.parent.name for path in train_paths],
    }
)
train_df["split"] = "train"

label_name_mapping = {}
for index, label_name in enumerate(sorted(train_df["label_name"].unique())):
    label_name_mapping[label_name] = index
train_df["label"] = train_df["label_name"].apply(lambda x: label_name_mapping[x])

val_df = train_df[train_df["split"] == "train"].groupby("label_name").sample(n=10, random_state=42)
train_df.loc[val_df.index, "split"] = "val"

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([train_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

45107
5438
50545


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,FW08DLR_McQueen_0014.png,alexander mcqueen,train,0,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,FW08DLR_McQueen_0015.png,alexander mcqueen,train,0,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,FW08DLR_McQueen_0017.png,alexander mcqueen,train,0,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,FW08DLR_McQueen_0031.png,alexander mcqueen,train,0,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,FW08DLR_McQueen_0042.png,alexander mcqueen,train,0,5.jpg


In [22]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 44607/44607 [00:05<00:00, 8612.68it/s]
100%|██████████| 500/500 [00:00<00:00, 8764.28it/s]
100%|██████████| 5438/5438 [00:00<00:00, 10367.06it/s]


## iFood 2019

In [23]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/iFood_2019")

train_paths = sorted((folder / "raw-data" / "train_set").rglob("*.*"))
print(len(train_paths))
val_paths = sorted((folder / "raw-data" / "val_set").rglob("*.*"))
print(len(val_paths))
test_paths = sorted((folder / "raw-data" / "test_set").rglob("*.*"))
print(len(test_paths))

train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
)
train_label = pd.read_csv(folder / "raw-data" / "train_labels.csv")
train_label.rename(columns={"img_name": "image_name"}, inplace=True)
train_df = pd.merge(train_df, train_label, on="image_name")
train_df["split"] = "train"
train_df["label_name"] = train_df["label"].apply(lambda x: f"C{x:03d}")

val_df = pd.DataFrame(
    {
        "image_path": val_paths,
        "image_name": [path.name for path in val_paths],
    }
)
val_label = pd.read_csv(folder / "raw-data" / "val_labels.csv")
val_label.rename(columns={"img_name": "image_name"}, inplace=True)
val_df = pd.merge(val_df, val_label, on="image_name")
val_df["split"] = "val"
val_df["label_name"] = val_df["label"].apply(lambda x: f"C{x:03d}")

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["label"] = -1
test_df["label_name"] = "unknown"
test_df["split"] = "test"

df = pd.concat([train_df, val_df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

118475
11994
28377
158846


Unnamed: 0,image_path,image_name,label,split,label_name,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,train_000000.jpg,94,train,C094,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,train_000001.jpg,94,train,C094,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,train_000002.jpg,94,train,C094,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,train_000003.jpg,94,train,C094,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,train_000004.jpg,94,train,C094,5.jpg


In [24]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 118475/118475 [00:13<00:00, 8633.49it/s]
100%|██████████| 11994/11994 [00:01<00:00, 8555.68it/s]
100%|██████████| 28377/28377 [00:02<00:00, 10100.23it/s]


## Plant CLEF 2024

In [25]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Plant_CLEF_2024")

paths = sorted((folder / "raw-data" / "images_max_side_800").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)
metadata = pd.read_csv(folder / "raw-data" / "PlantCLEF2024singleplanttrainingdata.csv", delimiter=";", usecols=["image_name", "species_id", "species", "learn_tag"])
metadata.rename(columns={"species_id": "label", "species": "label_name", "learn_tag": "split"}, inplace=True)
df = pd.merge(df, metadata, on="image_name")

label_mapping = {}
for index, label in enumerate(sorted(df["label"].unique())):
    label_mapping[label] = index
df["label"] = df["label"].apply(lambda x: label_mapping[x])

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

1408033
1408033


Unnamed: 0,image_path,image_name,label,label_name,split,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,0070793945bc6db2c597387006c5425751204baa.jpg,0,Lactuca virosa L.,train,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,009fa47428093ac9f2d7df162ed1da82300797f2.jpg,0,Lactuca virosa L.,train,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,00af911161109b7ce1b5f89fbc7a50668d98eb89.jpg,0,Lactuca virosa L.,train,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,00ee391ea705c86eaf954acbf08567504e4f347e.jpg,0,Lactuca virosa L.,train,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,00f5a6720aaeeaeb29818b5442b80a7714041df8.jpg,0,Lactuca virosa L.,train,5.jpg


In [26]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 1308899/1308899 [02:28<00:00, 8812.96it/s]
100%|██████████| 51194/51194 [00:05<00:00, 8910.90it/s]
100%|██████████| 47940/47940 [00:04<00:00, 10265.68it/s]


## Snake CLEF 2024

In [27]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Snake_CLEF_2024")

train_paths = sorted((folder / "raw-data" / "SnakeCLEF2023-train-medium_size").rglob("*.*"))
print(len(train_paths))

val_paths = sorted((folder / "raw-data" / "SnakeCLEF2023-val-medium_size").rglob("*.*"))
print(len(val_paths))

train_df = pd.DataFrame(
    {
        "image_path": [str(path.relative_to(folder / "raw-data" / "SnakeCLEF2023-train-medium_size")) for path in train_paths],
        "image_name": [path.name for path in train_paths],
        "label_name": [path.parent.name for path in train_paths],
    }
)
train_df["split"] = "train"
train_label = pd.read_csv(folder / "raw-data" / "SnakeCLEF2023-TrainMetadata-iNat.csv", usecols=["image_path", "class_id"])
train_label.rename(columns={"class_id": "label"}, inplace=True)
train_label.drop_duplicates(subset=["image_path"], inplace=True)
train_df = pd.merge(train_df, train_label, on="image_path")
train_df["image_path"] = folder / "raw-data" / "SnakeCLEF2023-train-medium_size" / train_df["image_path"]

val_df = pd.DataFrame(
    {
        "image_path": [str(path.relative_to(folder / "raw-data" / "SnakeCLEF2023-val-medium_size")) for path in val_paths],
        "image_name": [path.name for path in val_paths],
        "label_name": [path.parent.name for path in val_paths],
    }
)
val_df["split"] = "val"
val_label = pd.read_csv(folder / "raw-data" / "SnakeCLEF2023-ValMetadata.csv", usecols=["image_path", "class_id"])
val_label.rename(columns={"class_id": "label"}, inplace=True)
val_label.drop_duplicates(subset=["image_path"], inplace=True)
val_df = pd.merge(val_df, val_label, on="image_path")
val_df["image_path"] = folder / "raw-data" / "SnakeCLEF2023-val-medium_size" / val_df["image_path"]

df = pd.concat([train_df, val_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

154140
14117
168257


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,59067968.jpg,Amphiesma_stolatum,train,66,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,168477.JPG,Aspidelaps_lubricus,train,95,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,168482.JPG,Bitis_caudalis,train,157,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,1358706.JPG,Bitis_peringueyi,train,161,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,111486989.jpeg,Coronella_austriaca,train,387,5.jpg


In [28]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

100%|██████████| 154140/154140 [00:17<00:00, 8865.69it/s]
100%|██████████| 14117/14117 [00:01<00:00, 8801.02it/s]


## Sorghum-100 Cultivars 2022

In [29]:
folder = Path("/data/hanchong/open-source-data/SecondaryDatasets/Sorghum-100_Cultivars_2022")

paths = sorted((folder / "raw-data" / "train_images").rglob("*.*"))
print(len(paths))

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)

label = pd.read_csv(folder / "raw-data" / "train_cultivar_mapping.csv")
label.rename(columns={"cultivar": "label_name", "image": "image_name"}, inplace=True)

df = pd.merge(df, label, on="image_name")
df["split"] = "train"

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])

val_df = df[df["split"] == "train"].groupby("label").sample(n=10, random_state=42)
df.loc[val_df.index, "split"] = "val"

test_paths = sorted((folder / "raw-data" / "test").rglob("*.*"))
print(len(test_paths))

test_df = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
)
test_df["split"] = "test"
test_df["label"] = -1
test_df["label_name"] = "unknown"

df = pd.concat([df, test_df], ignore_index=True)

df["image_name_new"] = (df.index + 1).astype(str) + ".jpg"

print(len(df))
df.head()

22193
23639
45832


Unnamed: 0,image_path,image_name,label_name,split,label,image_name_new
0,/data/hanchong/open-source-data/SecondaryDatas...,2017-06-01__10-26-27-479.png,PI_155760,train,32,1.jpg
1,/data/hanchong/open-source-data/SecondaryDatas...,2017-06-01__10-26-28-944.png,PI_155760,train,32,2.jpg
2,/data/hanchong/open-source-data/SecondaryDatas...,2017-06-01__10-26-30-474.png,PI_155760,val,32,3.jpg
3,/data/hanchong/open-source-data/SecondaryDatas...,2017-06-01__10-26-37-978.png,PI_152751,train,12,4.jpg
4,/data/hanchong/open-source-data/SecondaryDatas...,2017-06-01__10-26-39-476.png,PI_152751,train,12,5.jpg


In [30]:
output_folder = folder / "processed-data"
output_folder.mkdir(parents=True, exist_ok=True)

df[df["split"] != "test"][["label", "label_name"]].drop_duplicates().sort_values(by="label").to_csv(output_folder / "labels.txt", index=False, header=False)
df["split"].value_counts().rename("count").reset_index().to_csv(output_folder / "split-counts.csv", index=False)

for split in ["train", "val", "test"]:
    _df = df[df["split"] == split]
    _df.to_csv(output_folder / f"df_{split}.csv", index=False)
    _df.groupby(["label", "label_name"]).size().rename("count").reset_index().sort_values(by="label").to_csv(output_folder / f"category-counts_{split}.csv", index=False)

    split_output_folder = output_folder / split
    split_output_folder.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(_df.iterrows(), total=len(_df)):
        if split == "test":
            output_path = split_output_folder / row["image_name_new"]
        else:
            output_path = split_output_folder / str(row["label"]) / row["image_name_new"]
            output_path.parent.mkdir(parents=True, exist_ok=True)

        if output_path.exists():
            continue

        output_path.symlink_to(row["image_path"])

  0%|          | 0/21193 [00:00<?, ?it/s]

100%|██████████| 21193/21193 [00:02<00:00, 8979.66it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9105.50it/s]
100%|██████████| 23639/23639 [00:02<00:00, 10439.65it/s]
