In [1]:
import json
import scipy.io
import pandas as pd
from pathlib import Path

# Oxford Flowers

In [39]:
paths = Path("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/jpg").rglob("*.jpg")
paths = sorted(paths)

imagelabels = scipy.io.loadmat("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/imagelabels.mat")
setid = scipy.io.loadmat("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/setid.mat")

label_name_df = pd.read_csv("/data/hanchong/open-source-data/Oxford_Flowers/raw-data/oxford_flower_102_name.csv")
label_name_mapping = {int(row["Index"]) + 1: row["Name"] for _, row in label_name_df.iterrows()}

split_mapping = {}
for i in setid["trnid"].flatten().tolist():
    split_mapping[i] = "train"
for i in setid["valid"].flatten().tolist():
    split_mapping[i] = "valid"
for i in setid["tstid"].flatten().tolist():
    split_mapping[i] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)
df["label"] = imagelabels["labels"].flatten()
df["label_name"] = df["label"].apply(lambda x: label_name_mapping[x])
df["split"] = df["image_name"].apply(lambda x: split_mapping[int(x.split("_")[-1].split(".")[0])])

print(len(df))
df.head()

8189


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/Oxford_Flowers...,image_00001.jpg,77,passion flower,test
1,/data/hanchong/open-source-data/Oxford_Flowers...,image_00002.jpg,77,passion flower,test
2,/data/hanchong/open-source-data/Oxford_Flowers...,image_00003.jpg,77,passion flower,test
3,/data/hanchong/open-source-data/Oxford_Flowers...,image_00004.jpg,77,passion flower,test
4,/data/hanchong/open-source-data/Oxford_Flowers...,image_00005.jpg,77,passion flower,test


In [40]:
df["label"].nunique()

102

In [41]:
df["label"].value_counts()

label
51    258
77    251
46    196
73    194
89    184
     ... 
27     40
3      40
1      40
7      40
45     40
Name: count, Length: 102, dtype: int64

In [42]:
print(len(setid["trnid"].flatten()))
print(len(setid["valid"].flatten()))
print(len(setid["tstid"].flatten()))

1020
1020
6149


In [43]:
df["split"].value_counts()

split
test     6149
valid    1020
train    1020
Name: count, dtype: int64

# CUB200-2011

In [3]:
paths = Path("/data/hanchong/open-source-data/CUB200-2011/raw-data/images").rglob("*.jpg")
paths = sorted(paths)

index_split_df = pd.read_csv("/data/hanchong/open-source-data/CUB200-2011/raw-data/train_test_split.txt", sep=" ", header=None)
index_split_df.columns = ["Index", "Split"]

index_image_df = pd.read_csv("/data/hanchong/open-source-data/CUB200-2011/raw-data/images.txt", sep=" ", header=None)
index_image_df.columns = ["Index", "Image"]

split_df = pd.merge(index_split_df, index_image_df, on="Index")
split_mapping = {row["Image"].split("/")[-1]: {0: "test", 1: "train"}[row["Split"]] for _, row in split_df.iterrows()}

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name.split(".")[0]) for path in paths],
        "label_name": [path.parent.name.split(".")[-1] for path in paths],
    }
)
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

print(len(df))
df.head()

11788


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0001_796111.jpg,1,Black_footed_Albatross,test
1,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0002_55.jpg,1,Black_footed_Albatross,test
2,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0003_796136.jpg,1,Black_footed_Albatross,test
3,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0005_796090.jpg,1,Black_footed_Albatross,test
4,/data/hanchong/open-source-data/CUB200-2011/ra...,Black_Footed_Albatross_0006_796065.jpg,1,Black_footed_Albatross,test


In [4]:
df["label"].nunique()

200

In [5]:
df["label"].value_counts()

label
1      60
2      60
4      60
21     60
13     60
       ..
105    49
8      48
18     45
5      44
6      41
Name: count, Length: 200, dtype: int64

In [6]:
split_df["Split"].value_counts()

Split
1    5994
0    5794
Name: count, dtype: int64

In [7]:
df["split"].value_counts()

split
train    5994
test     5794
Name: count, dtype: int64

# Stanford Dogs

In [8]:
folder = Path("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/Images")
paths = folder.rglob("*.jpg")
paths = sorted(paths)

train_list = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/train_list.mat")
test_list = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Dogs/raw-data/test_list.mat")

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)

split_df = pd.concat(
    [
        pd.DataFrame(
            {
                "image_path": [folder / f[0] for f in train_list["file_list"].flatten().tolist()],
                "label": train_list["labels"].flatten().tolist(),
                "label_name": [f[0].split("/")[0] for f in train_list["file_list"].flatten().tolist()],
                "split": "train",
            }
        ),
        pd.DataFrame(
            {
                "image_path": [folder / f[0] for f in test_list["file_list"].flatten().tolist()],
                "label": test_list["labels"].flatten().tolist(),
                "label_name": [f[0].split("/")[0] for f in test_list["file_list"].flatten().tolist()],
                "split": "test",
            }
        ),
    ],
    ignore_index=True,
)

df = pd.merge(df, split_df, on="image_path")

print(len(df))
df.head()

20580


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10074.jpg,1,n02085620-Chihuahua,test
1,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10131.jpg,1,n02085620-Chihuahua,test
2,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10621.jpg,1,n02085620-Chihuahua,train
3,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_1073.jpg,1,n02085620-Chihuahua,test
4,/data/hanchong/open-source-data/Stanford_Dogs/...,n02085620_10976.jpg,1,n02085620-Chihuahua,train


In [9]:
df["label"].nunique()

120

In [10]:
df["label"].value_counts()

label
3      252
10     239
27     232
108    219
89     218
      ... 
74     150
101    150
119    150
4      149
18     148
Name: count, Length: 120, dtype: int64

In [11]:
split_df["split"].value_counts()

split
train    12000
test      8580
Name: count, dtype: int64

In [12]:
df["split"].value_counts()

split
train    12000
test      8580
Name: count, dtype: int64

# Standford Cars

In [52]:
folder = Path("/data/hanchong/open-source-data/Stanford_Cars/raw-data")

train_paths = (folder / "cars_train").rglob("*.jpg")
train_paths = sorted(train_paths)

test_paths = (folder / "cars_test").rglob("*.jpg")
test_paths = sorted(test_paths)

cars_meta = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/devkit/cars_meta.mat")
class_names = cars_meta["class_names"].flatten().tolist()
class_names = [name[0] for name in class_names]

cars_train_annos = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/devkit/cars_train_annos.mat")

train_df = pd.DataFrame(
    {
        "image_name": [f[-1][0] for f in cars_train_annos["annotations"].flatten().tolist()],
        "label": [f[-2][0][0] for f in cars_train_annos["annotations"].flatten().tolist()],
    }
)
train_df["label_name"] = train_df["label"].apply(lambda x: class_names[x - 1])
train_df["split"] = "train"

cars_test_annos = scipy.io.loadmat("/data/hanchong/open-source-data/Stanford_Cars/raw-data/cars_test_annos_withlabels.mat")

test_df = pd.DataFrame(
    {
        "image_name": [f[-1][0] for f in cars_test_annos["annotations"].flatten().tolist()],
        "label": [f[-2][0][0] for f in cars_test_annos["annotations"].flatten().tolist()],
    }
)
test_df["label_name"] = test_df["label"].apply(lambda x: class_names[x - 1])
test_df["split"] = "test"


train_df = pd.DataFrame(
    {
        "image_path": train_paths,
        "image_name": [path.name for path in train_paths],
    }
).merge(train_df, on="image_name")

test_df_2 = pd.DataFrame(
    {
        "image_path": test_paths,
        "image_name": [path.name for path in test_paths],
    }
).merge(test_df, on="image_name")

df = pd.concat([train_df, test_df], ignore_index=True)

print(len(df))
df.head()

16185


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/Stanford_Cars/...,00001.jpg,14,Audi TTS Coupe 2012,train
1,/data/hanchong/open-source-data/Stanford_Cars/...,00002.jpg,3,Acura TL Sedan 2012,train
2,/data/hanchong/open-source-data/Stanford_Cars/...,00003.jpg,91,Dodge Dakota Club Cab 2007,train
3,/data/hanchong/open-source-data/Stanford_Cars/...,00004.jpg,134,Hyundai Sonata Hybrid Sedan 2012,train
4,/data/hanchong/open-source-data/Stanford_Cars/...,00005.jpg,106,Ford F-450 Super Duty Crew Cab 2012,train


In [53]:
df["label"].nunique()

196

In [54]:
df["label"].value_counts()

label
119    136
79      97
161     96
167     95
144     93
      ... 
175     61
64      59
158     58
99      55
136     48
Name: count, Length: 196, dtype: int64

In [55]:
df["split"].value_counts()

split
train    8144
test     8041
Name: count, dtype: int64

# FGVC Aircraft

In [None]:
paths = Path("/data/hanchong/open-source-data/FGCV_Aircraft/raw-data/data/images").rglob("*.jpg")
paths = sorted(paths)

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
    }
)

label_name_mapping = {}
with open("/data/hanchong/open-source-data/FGCV_Aircraft/raw-data/data/variants.txt") as f:
    for index, line in enumerate(f.readlines()):
        label_name = line.strip()
        label_name_mapping[label_name] = index + 1

data_list = []
for path, split in [
    ("/data/hanchong/open-source-data/FGCV_Aircraft/raw-data/data/images_variant_train.txt", "train"),
    ("/data/hanchong/open-source-data/FGCV_Aircraft/raw-data/data/images_variant_val.txt", "val"),
    ("/data/hanchong/open-source-data/FGCV_Aircraft/raw-data/data/images_variant_test.txt", "test"),
]:
    with open(path) as f:
        for line in f.readlines():
            image_name = line.strip().split(" ", 1)[0]
            data_list.append(
                {
                    "image_name": image_name + ".jpg",
                    "label": label_name_mapping[line[len(image_name) + 1 :].strip()],
                    "label_name": line[len(image_name) + 1 :].strip(),
                    "split": split,
                }
            )


df = pd.merge(df, pd.DataFrame(data_list), on="image_name")

print(len(df))
df.head()

10000


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/FGCV_Aircraft/...,0034309.jpg,56,DC-8,val
1,/data/hanchong/open-source-data/FGCV_Aircraft/...,0034958.jpg,3,737-200,val
2,/data/hanchong/open-source-data/FGCV_Aircraft/...,0037511.jpg,57,DC-9-30,val
3,/data/hanchong/open-source-data/FGCV_Aircraft/...,0037512.jpg,3,737-200,test
4,/data/hanchong/open-source-data/FGCV_Aircraft/...,0038598.jpg,86,MD-11,train


In [120]:
df["label"].nunique()

100

In [121]:
df["label"].value_counts()

label
56    100
3     100
57    100
86    100
42    100
     ... 
59    100
93    100
67    100
34    100
66    100
Name: count, Length: 100, dtype: int64

In [122]:
df["split"].value_counts()

split
train    3334
val      3333
test     3333
Name: count, dtype: int64

# Birdsnap

In [2]:
paths = Path("/data/hanchong/open-source-data/Birdsnap/raw-data/images").rglob("*.jpg")
paths = sorted(paths)

split_mapping = {}
with open("/data/hanchong/open-source-data/Birdsnap/birdsnap/test_images.txt") as f:
    for line in f.readlines()[1:]:
        image_name = line.strip()
        split_mapping[image_name] = "test"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],        
        "label_name": [path.parent.name for path in paths],
    }
)

label_name_mapping = {}
for index, label_name in enumerate(sorted(df["label_name"].unique())):
    label_name_mapping[label_name] = index + 1

df["label"] = df["label_name"].apply(lambda x: label_name_mapping[x])
df['split'] = df.apply(lambda x: split_mapping.get(f"{x['label_name']}/{x['image_name']}", "train"), axis=1)

print(len(df))
df.head()

39286


Unnamed: 0,image_path,image_name,label_name,label,split
0,/data/hanchong/open-source-data/Birdsnap/raw-d...,534070.jpg,Acadian_Flycatcher,1,train
1,/data/hanchong/open-source-data/Birdsnap/raw-d...,534076.jpg,Acadian_Flycatcher,1,train
2,/data/hanchong/open-source-data/Birdsnap/raw-d...,534079.jpg,Acadian_Flycatcher,1,train
3,/data/hanchong/open-source-data/Birdsnap/raw-d...,534080.jpg,Acadian_Flycatcher,1,train
4,/data/hanchong/open-source-data/Birdsnap/raw-d...,534083.jpg,Acadian_Flycatcher,1,train


In [3]:
df["label"].nunique()

500

In [5]:
df["label"].value_counts()

label
259    97
356    95
393    94
160    94
497    94
       ..
127    45
291    44
213    41
122    38
105    33
Name: count, Length: 500, dtype: int64

In [4]:
df["split"].value_counts()

split
train    37468
test      1818
Name: count, dtype: int64

# NABirds

In [133]:
paths = Path("/data/hanchong/open-source-data/NABirds/raw-data/images").rglob("*.jpg")
paths = sorted(paths)

label_name_mapping = {}
with open("/data/hanchong/open-source-data/NABirds/raw-data/classes.txt") as f:
    for line in f.readlines():
        label = line.strip().split(" ", 1)[0]
        label_name = line[len(label) + 1 :].strip()
        label_name_mapping[int(label)] = label_name

split_mapping = {}
with open("/data/hanchong/open-source-data/NABirds/raw-data/train_test_split.txt") as f:
    for line in f.readlines():
        image_name, split = line.strip().split(" ")
        split_mapping["".join(image_name.split("-")) + ".jpg"] = {0: "test", 1: "train"}[int(split)]

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name) for path in paths],
    }
)
df["label_name"] = df["label"].apply(lambda x: label_name_mapping[x])
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

print(len(df))
df.head()

48527


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/NABirds/raw-da...,01f53d6bf5e449438d2bb79e0854bca4.jpg,295,Common Eider (Adult male),train
1,/data/hanchong/open-source-data/NABirds/raw-da...,074a068d75404dfc9e37bffc8b37265e.jpg,295,Common Eider (Adult male),test
2,/data/hanchong/open-source-data/NABirds/raw-da...,0daddfcbc9a54170ac06402bffeff37c.jpg,295,Common Eider (Adult male),train
3,/data/hanchong/open-source-data/NABirds/raw-da...,19371d9dd2874202b9c7948a5543ed2e.jpg,295,Common Eider (Adult male),test
4,/data/hanchong/open-source-data/NABirds/raw-da...,1fddd7c3b1b242eba5c020aaad4fb429.jpg,295,Common Eider (Adult male),train


In [134]:
df["label"].nunique()

555

In [135]:
df["label"].value_counts()

label
299     120
314     120
317     120
320     120
756     120
       ... 
1006     25
755      24
341      23
664      21
975      13
Name: count, Length: 555, dtype: int64

In [137]:
split_df["split"].value_counts()

split
0    24633
1    23929
Name: count, dtype: int64

In [138]:
df["split"].value_counts()

split
test     24615
train    23912
Name: count, dtype: int64

# iNat2017

In [38]:
paths = Path("/data/hanchong/open-source-data/iNat2017/raw-data/train_val_images").rglob("*.jpg")
paths = sorted(paths)

split_mapping = {}
with open("/data/hanchong/open-source-data/iNat2017/raw-data/train2017.json") as f:
    train2019 = json.load(f)
    for image in train2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "train"

with open("/data/hanchong/open-source-data/iNat2017/raw-data/val2017.json") as f:
    val2019 = json.load(f)
    for image in val2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "val"


label_name_mapping = {}
for category in train2019["categories"]:
    label_name_mapping[category["name"]] = int(category["id"])

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [label_name_mapping[path.parent.name] for path in paths],
        "label_name": [path.parent.name for path in paths],
    }
)
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

print(len(df))
df.head()

675170


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/iNat2017/raw-d...,0bb15d607734ee8ed27d8f45e88bf426.jpg,4745,Abudefduf saxatilis,val
1,/data/hanchong/open-source-data/iNat2017/raw-d...,10d4c817f42724f907bdf5f640d4d472.jpg,4745,Abudefduf saxatilis,val
2,/data/hanchong/open-source-data/iNat2017/raw-d...,19892e351c95f9ee4e25e8667fc3f7e9.jpg,4745,Abudefduf saxatilis,train
3,/data/hanchong/open-source-data/iNat2017/raw-d...,1faf0ba615708a021d080c5a8898dd8e.jpg,4745,Abudefduf saxatilis,train
4,/data/hanchong/open-source-data/iNat2017/raw-d...,206dae61fd527c7d47bf545eb1bddb36.jpg,4745,Abudefduf saxatilis,train


In [39]:
df["label"].nunique()

5089

In [40]:
df["label"].value_counts()

label
5076    3949
3752    3914
514     3712
169     3359
1161    3290
        ... 
1151      15
1588      15
1871      15
3068      14
72        14
Name: count, Length: 5089, dtype: int64

In [41]:
df["split"].value_counts()

split
train    579184
val       95986
Name: count, dtype: int64

# iNat2019

In [34]:
paths = Path("/data/hanchong/open-source-data/iNat2019/raw-data/train_val2019").rglob("*.jpg")
paths = sorted(paths)

with open("/data/hanchong/open-source-data/iNat2019/raw-data/categories.json") as f:
    categories = json.load(f)

split_mapping = {}
with open("/data/hanchong/open-source-data/iNat2019/raw-data/train2019.json") as f:
    train2019 = json.load(f)
    for image in train2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "train"

with open("/data/hanchong/open-source-data/iNat2019/raw-data/val2019.json") as f:
    val2019 = json.load(f)
    for image in val2019["images"]:
        split_mapping[image["file_name"].split("/")[-1]] = "val"

df = pd.DataFrame(
    {
        "image_path": paths,
        "image_name": [path.name for path in paths],
        "label": [int(path.parent.name) for path in paths],
    }
)
df["label_name"] = df["label"].apply(lambda x: categories[x]["name"])
df["split"] = df["image_name"].apply(lambda x: split_mapping[x])

print(len(df))
df.head()

268243


Unnamed: 0,image_path,image_name,label,label_name,split
0,/data/hanchong/open-source-data/iNat2019/raw-d...,0042d05b4ffbd5a1ce2fc56513a7777e.jpg,153,Lithobates sphenocephalus,train
1,/data/hanchong/open-source-data/iNat2019/raw-d...,006f69e838b87cfff3d12120795c4ada.jpg,153,Lithobates sphenocephalus,train
2,/data/hanchong/open-source-data/iNat2019/raw-d...,00c1bf968b20839ead054b3ab9eb1ce2.jpg,153,Lithobates sphenocephalus,train
3,/data/hanchong/open-source-data/iNat2019/raw-d...,011ae401924d635371dc70e059b9748b.jpg,153,Lithobates sphenocephalus,train
4,/data/hanchong/open-source-data/iNat2019/raw-d...,013862c72d6dc1344892e96af1130d76.jpg,153,Lithobates sphenocephalus,train


In [35]:
df["label"].nunique()

1010

In [36]:
df["label"].value_counts()

label
155    503
201    503
153    503
156    503
194    503
      ... 
813     29
612     27
553     26
899     25
689     19
Name: count, Length: 1010, dtype: int64

In [37]:
df["split"].value_counts()

split
train    265213
val        3030
Name: count, dtype: int64