In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import scipy
import numpy as np
import random

Output CUB split jason files

In [16]:
# Define paths
dataset_root = '/storage/data/changyu/CUB_200_2011'
images_path = os.path.join(dataset_root, 'images.txt')
splits_path = os.path.join(dataset_root, 'train_test_split.txt')
image_class_labels_path = os.path.join(dataset_root, 'image_class_labels.txt')


In [17]:
 # Load dataset information
splits = pd.read_csv(splits_path, sep=' ', names=['img_id', 'is_train'])
labels = pd.read_csv(image_class_labels_path, sep=' ', names=['img_id', 'label'])
images = pd.read_csv(images_path, sep=' ', names=['img_id', 'image_name'])

# Merge splits and labels
data_info = pd.merge(splits, labels, on='img_id')
data_info = pd.merge(data_info, images, on='img_id')

In [19]:
# Train-test split provided by the dataset
train_data = data_info[data_info['is_train'] == 1]
test_data = data_info[data_info['is_train'] == 0]

In [20]:
# Split the original training data to create a validation set
train_data, val_data = train_test_split(train_data, test_size=0.10, random_state=42, stratify=train_data['label'])


In [47]:
#save data to json
train_json, val_json, test_json = {},{},{}
for i in range(train_data.shape[0]):
    img_path = train_data.iloc[i,3]
    train_json[img_path] = int(train_data.iloc[i,2])

for i in range(val_data.shape[0]):
    img_path = val_data.iloc[i,3]
    val_json[img_path] = int(val_data.iloc[i,2])

for i in range(test_data.shape[0]):
    img_path = test_data.iloc[i,3]
    test_json[img_path] = int(test_data.iloc[i,2])


In [48]:
# Save json files
json_file_path = os.path.join(dataset_root, 'train.json')
with open(json_file_path, 'w') as f:
    json.dump(train_json, f, indent=4)

json_file_path = os.path.join(dataset_root, 'val.json')
with open(json_file_path, 'w') as f:
    json.dump(val_json, f, indent=4)

json_file_path = os.path.join(dataset_root, 'test.json')
with open(json_file_path, 'w') as f:
    json.dump(test_json, f, indent=4)

In [49]:
train_data["label"].value_counts()

label
103    27
191    27
55     27
45     27
10     27
       ..
107    26
196    26
190    26
141    26
126    26
Name: count, Length: 200, dtype: int64

In [50]:
val_data["label"].value_counts()

label
32     3
143    3
177    3
130    3
86     3
      ..
146    3
43     3
162    3
124    3
88     3
Name: count, Length: 200, dtype: int64

In [54]:
test_data["label"].value_counts()

label
1      30
143    30
120    30
121    30
122    30
       ..
105    19
8      18
18     15
5      14
6      11
Name: count, Length: 200, dtype: int64

Output Flower split jason files

In [47]:
dataset_root_flower = '/storage/data/changyu/OxfordFlowers'
split = scipy.io.loadmat('/storage/data/changyu/OxfordFlowers/setid.mat')          #train and val is 50% and 50%, we will resplit it to 90% and 10%
labels = scipy.io.loadmat('/storage/data/changyu/OxfordFlowers/imagelabels.mat')
labels = labels['labels'].reshape(-1)
train_idx = split['trnid'].reshape(-1) -1
val_idx = split['valid'].reshape(-1) -1
test_idx = split['tstid'].reshape(-1) -1

In [42]:
#get all the image relative paths under /storage/data/changyu/Flower
image_paths = []
for root, dirs, files in os.walk(dataset_root_flower):   
    for file in files:
        if file.endswith('.jpg'):
            image_paths.append(os.path.relpath(os.path.join(root, file), dataset_root_flower))
image_paths = sorted(image_paths)

In [43]:
#merge train and val, then split it to 90% and 10%
train_val_idx = np.concatenate((train_idx, val_idx))
train_idx_new, val_idx_new = train_test_split(train_val_idx, test_size=0.10, random_state=42, stratify=labels[train_val_idx])


In [44]:
#form the split json files with the format of {image_path: label}
train_json, val_json, test_json = {},{},{}
for i in train_idx_new:
    img_path = image_paths[i]
    train_json[img_path] = int(labels[i])

for i in val_idx_new:
    img_path = image_paths[i]
    val_json[img_path] = int(labels[i])

for i in test_idx:
    img_path = image_paths[i]
    test_json[img_path] = int(labels[i])    

# Save json files
with open(os.path.join(dataset_root_flower, 'train.json'), 'w') as f:
    json.dump(train_json, f, indent=4)

with open(os.path.join(dataset_root_flower, 'val.json'), 'w') as f:
    json.dump(val_json, f, indent=4)

with open(os.path.join(dataset_root_flower, 'test.json'), 'w') as f:
    json.dump(test_json, f, indent=4)


Get Aircraft split jason files

In [18]:
# read the variants.txt file
variants = pd.read_csv('/storage/data/changyu/fgvc-aircraft-2013b/data/variants.txt', header=None, names=['Variant'])

# Reset the index so it becomes a column in the DataFrame
variants.reset_index(inplace=True)
variants.rename(columns={'index': 'Variant_Index'}, inplace=True)
variants['Variant_Index'] = variants['Variant_Index'] + 1
print(variants.head())


   Variant_Index  Variant
0              1  707-320
1              2  727-200
2              3  737-200
3              4  737-300
4              5  737-400


In [23]:
# Load the image to variant mappings
images_variant_test = pd.read_csv('/storage/data/changyu/fgvc-aircraft-2013b/data/images_variant_test.txt', header=None, names=['Combined'])
images_variant_trainval = pd.read_csv('/storage/data/changyu/fgvc-aircraft-2013b/data/images_variant_trainval.txt', header=None, names=['Combined'])

# Split the 'Combined' column into two columns at the first space
images_variant_test[['Filename', 'Variant']] = images_variant_test['Combined'].str.split(n=1, expand=True)
images_variant_trainval[['Filename', 'Variant']] = images_variant_trainval['Combined'].str.split(n=1, expand=True)

merged_test = pd.merge(images_variant_test, variants, on='Variant', how='left')
merged_trainval = pd.merge(images_variant_trainval, variants, on='Variant', how='left')

#use Filename to get the relative path
merged_test['re_path'] = 'data/images/' + merged_test['Filename'] + '.jpg'
merged_trainval['re_path'] = 'data/images/' + merged_trainval['Filename'] + '.jpg'

print(merged_test.head())
print(merged_trainval.head())

          Combined Filename  Variant  Variant_Index                  re_path
0  1514522 707-320  1514522  707-320              1  data/images/1514522.jpg
1  0747566 707-320  0747566  707-320              1  data/images/0747566.jpg
2  1008575 707-320  1008575  707-320              1  data/images/1008575.jpg
3  0717480 707-320  0717480  707-320              1  data/images/0717480.jpg
4  0991569 707-320  0991569  707-320              1  data/images/0991569.jpg
          Combined Filename  Variant  Variant_Index                  re_path
0  1025794 707-320  1025794  707-320              1  data/images/1025794.jpg
1  1340192 707-320  1340192  707-320              1  data/images/1340192.jpg
2  0056978 707-320  0056978  707-320              1  data/images/0056978.jpg
3  0698580 707-320  0698580  707-320              1  data/images/0698580.jpg
4  0450014 707-320  0450014  707-320              1  data/images/0450014.jpg


In [26]:
#Split the trainval to train and val with 90% and 10%
trainval_idx = merged_trainval.index
train_idx_new, val_idx_new = train_test_split(trainval_idx, test_size=0.10, random_state=42, stratify=merged_trainval['Variant_Index'])

In [48]:
# save the json files
train_json, val_json, test_json = {},{},{}
for i in train_idx_new:
    img_path = merged_trainval.iloc[i,4]
    train_json[img_path] = int(merged_trainval.iloc[i,3])

for i in val_idx_new:
    img_path = merged_trainval.iloc[i,4]
    val_json[img_path] = int(merged_trainval.iloc[i,3])

for i in range(merged_test.shape[0]):
    img_path = merged_test.iloc[i,4]
    test_json[img_path] = int(merged_test.iloc[i,3])

# Save json files
with open('/storage/data/changyu/fgvc-aircraft-2013b/train.json', 'w') as f:
    json.dump(train_json, f, indent=4)

with open('/storage/data/changyu/fgvc-aircraft-2013b/val.json', 'w') as f:
    json.dump(val_json, f, indent=4)

with open('/storage/data/changyu/fgvc-aircraft-2013b/test.json', 'w') as f:
    json.dump(test_json, f, indent=4)


In [None]:
# the ratio of the number of images in each class to the total number of images
print(merged_trainval.iloc[train_idx_new]['Variant_Index'].value_counts())
print(merged_trainval.iloc[val_idx_new]['Variant_Index'].value_counts())
print(merged_test['Variant_Index'].value_counts())


Get stanford cars split jason files


download dataset according to https://github.com/pytorch/vision/issues/7545

In [74]:
test_data = scipy.io.loadmat('/storage/data/changyu/StanfordCars/cars_test_annos_withlabels (1).mat')
train_data = scipy.io.loadmat('/storage/data/changyu/StanfordCars/devkit/cars_train_annos.mat')

In [55]:
test_data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Dec 14 14:13:07 2013',
 '__version__': '1.0',
 '__globals__': [],
 'annotations': array([[(array([[30]], dtype=uint8), array([[52]], dtype=uint8), array([[246]], dtype=uint8), array([[147]], dtype=uint8), array([[181]], dtype=uint8), array(['00001.jpg'], dtype='<U9')),
         (array([[100]], dtype=uint8), array([[19]], dtype=uint8), array([[576]], dtype=uint16), array([[203]], dtype=uint8), array([[103]], dtype=uint8), array(['00002.jpg'], dtype='<U9')),
         (array([[51]], dtype=uint8), array([[105]], dtype=uint8), array([[968]], dtype=uint16), array([[659]], dtype=uint16), array([[145]], dtype=uint8), array(['00003.jpg'], dtype='<U9')),
         ...,
         (array([[33]], dtype=uint8), array([[27]], dtype=uint8), array([[602]], dtype=uint16), array([[252]], dtype=uint8), array([[17]], dtype=uint8), array(['08039.jpg'], dtype='<U9')),
         (array([[33]], dtype=uint8), array([[142]], dtype=uint8), arra

In [75]:
train_data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Dec 14 14:13:07 2013',
 '__version__': '1.0',
 '__globals__': [],
 'annotations': array([[(array([[39]], dtype=uint8), array([[116]], dtype=uint8), array([[569]], dtype=uint16), array([[375]], dtype=uint16), array([[14]], dtype=uint8), array(['00001.jpg'], dtype='<U9')),
         (array([[36]], dtype=uint8), array([[116]], dtype=uint8), array([[868]], dtype=uint16), array([[587]], dtype=uint16), array([[3]], dtype=uint8), array(['00002.jpg'], dtype='<U9')),
         (array([[85]], dtype=uint8), array([[109]], dtype=uint8), array([[601]], dtype=uint16), array([[381]], dtype=uint16), array([[91]], dtype=uint8), array(['00003.jpg'], dtype='<U9')),
         ...,
         (array([[26]], dtype=uint8), array([[246]], dtype=uint8), array([[660]], dtype=uint16), array([[449]], dtype=uint16), array([[163]], dtype=uint8), array(['08142.jpg'], dtype='<U9')),
         (array([[78]], dtype=uint8), array([[526]], dtype=uint16), 

Get stanford dogs split jason files

In [2]:
test_data = scipy.io.loadmat("/storage/data/changyu/StanfordDogs/test_list.mat")
train_data = scipy.io.loadmat("/storage/data/changyu/StanfordDogs/train_list.mat")
file_list = scipy.io.loadmat("/storage/data/changyu/StanfordDogs/file_list.mat")

In [11]:
#extract test data to a jason with image_path and label
# test_json = {}
# for i in range(len(test_data['file_list'])):
#     img_path = test_data['file_list'][i][0][0]
#     label = test_data['labels'][i][0]
#     test_json[img_path] = int(label)


1


In [12]:
#extract train data to a dataframe with image_path and label columns
train_df = pd.DataFrame(columns=['image_path', 'label'])
for i in range(len(train_data['file_list'])):
    img_path = train_data['file_list'][i][0][0]
    label = int(train_data['labels'][i][0])
    train_df.loc[i, 'image_path'] = img_path
    train_df.loc[i, 'label'] = label

test_df = pd.DataFrame(columns=['image_path', 'label'])
for i in range(len(test_data['file_list'])):
    img_path = test_data['file_list'][i][0][0]
    label = int(test_data['labels'][i][0])
    test_df.loc[i, 'image_path'] = img_path
    test_df.loc[i, 'label'] = label

In [13]:
#split the train data to train and val with 90% and 10%
train_df_new, val_df_new = train_test_split(train_df, test_size=0.10, random_state=42, stratify=train_df['label'])

#save the json files
train_json, val_json, test_json = {},{},{}
for i in range(train_df_new.shape[0]):
    img_path = train_df_new.iloc[i,0]
    label = train_df_new.iloc[i,1]
    train_json[img_path] = label

for i in range(val_df_new.shape[0]):
    img_path = val_df_new.iloc[i,0]
    label = val_df_new.iloc[i,1]
    val_json[img_path] = label

for i in range(test_df.shape[0]):
    img_path = test_df.iloc[i,0]
    label = test_df.iloc[i,1]
    test_json[img_path] = label


In [14]:
# Save json files
with open('/storage/data/changyu/StanfordDogs/train.json', 'w') as f:
    json.dump(train_json, f, indent=4)

with open('/storage/data/changyu/StanfordDogs/val.json', 'w') as f:
    json.dump(val_json, f, indent=4)

with open('/storage/data/changyu/StanfordDogs/test.json', 'w') as f:
    json.dump(test_json, f, indent=4)

In [48]:
print(test_df['label'].value_counts())

label
3      152
10     139
27     132
108    119
89     118
      ... 
71      50
101     50
83      50
4       49
18      48
Name: count, Length: 120, dtype: int64


In [142]:
# count the number of images in each class to the total number of images
print(train_df_new['label'].value_counts())
print(val_df_new['label'].value_counts())

label
56     90
17     90
68     90
58     90
101    90
       ..
5      90
16     90
52     90
38     90
80     90
Name: count, Length: 120, dtype: int64
label
82     10
87     10
68     10
116    10
20     10
       ..
32     10
1      10
15     10
45     10
56     10
Name: count, Length: 120, dtype: int64


Get nabirds split jason files

In [40]:
#read files to dataframe
image_class_labels = pd.read_csv('/storage/data/changyu/nabirds/image_class_labels.txt', sep=' ', names=['img_id', 'label'])
images = pd.read_csv('/storage/data/changyu/nabirds/images.txt', sep=' ', names=['img_id', 'image_re_path'])
splits = pd.read_csv('/storage/data/changyu/nabirds/train_test_split.txt', sep=' ', names=['img_id', 'is_train'])

#merge the dataframes
data_info = pd.merge(image_class_labels, images, on='img_id')
data_info = pd.merge(data_info, splits, on='img_id') 

#split train and test data
train_data = data_info[data_info['is_train'] == 1]
test_data = data_info[data_info['is_train'] == 0]

#split the train data to train and val with 90% and 10%
train_data_new, val_data_new = train_test_split(train_data, test_size=0.10, random_state=42, stratify=train_data['label'])


In [41]:
#!!!!!!!!Beacause the this split makes class 627 and 975 without any sample in the val set, 
#!!!!!!!!we add one sample from train set to val set.
#because the logic of json_dataset class, we make this modification
#add one sample from class 627 and 975 to val set
val_data_new = pd.concat([val_data_new, train_data_new[train_data_new['label']==627].iloc[[0]]], ignore_index=True)
val_data_new = pd.concat([val_data_new, train_data_new[train_data_new['label']==975].iloc[[0]]], ignore_index=True)
train_data_new = train_data_new.drop(train_data_new[train_data_new['label']==627].index[0])
train_data_new = train_data_new.drop(train_data_new[train_data_new['label']==975].index[0])



In [42]:
#save the json files
train_json, val_json, test_json = {},{},{}
for i in range(train_data_new.shape[0]):
    img_path = train_data_new.iloc[i,2]
    label = train_data_new.iloc[i,1]
    train_json[img_path] = int(label)

for i in range(val_data_new.shape[0]):
    img_path = val_data_new.iloc[i,2]
    label = val_data_new.iloc[i,1]
    val_json[img_path] = int(label)

for i in range(test_data.shape[0]):
    img_path = test_data.iloc[i,2]
    label = test_data.iloc[i,1]
    test_json[img_path] = int(label)

# Save json files
with open('/storage/data/changyu/nabirds/train.json', 'w') as f:
    json.dump(train_json, f, indent=4)

with open('/storage/data/changyu/nabirds/val.json', 'w') as f:
    json.dump(val_json, f, indent=4)

with open('/storage/data/changyu/nabirds/test.json', 'w') as f:
    json.dump(test_json, f, indent=4)


In [43]:
#count the number of images in each class to the total number of images
#print(train_data_new['label'].value_counts())
#print(val_data_new['label'].value_counts())
#print(test_data['label'].value_counts())

#save counts reults into dataframes with label as one column and counts as another column
train_counts = train_data_new['label'].value_counts().reset_index()
train_counts.columns = ['label', 'counts']
val_counts = val_data_new['label'].value_counts().reset_index()
val_counts.columns = ['label', 'counts']
test_counts = test_data['label'].value_counts().reset_index()
test_counts.columns = ['label', 'counts']


#merge the counts dataframes
merged_df = pd.merge(train_counts, val_counts, on='label', how='outer')
merged_df = pd.merge(merged_df, test_counts, on='label', how='outer')


In [44]:
#save the merged dataframe to a csv file
merged_df.to_csv('/storage/data/changyu/nabirds/label_counts.csv', index=False)

In [45]:
train_counts

Unnamed: 0,label,counts
0,979,54
1,935,54
2,400,54
3,320,54
4,778,54
...,...,...
550,807,7
551,633,6
552,815,5
553,627,4


In [46]:
val_counts

Unnamed: 0,label,counts
0,856,6
1,823,6
2,924,6
3,352,6
4,935,6
...,...,...
550,618,1
551,369,1
552,664,1
553,759,1


In [47]:
test_counts

Unnamed: 0,label,counts
0,657,60
1,922,60
2,856,60
3,655,60
4,823,60
...,...,...
550,755,12
551,599,12
552,464,12
553,664,11


Imagenet

In [None]:
import os
import tarfile
import argparse

def extract_tar(tar_path, extract_dir):
    """Extracts a tar file to the specified directory."""
    if not tarfile.is_tarfile(tar_path):
        raise ValueError(f"{tar_path} is not a valid tar file.")
    with tarfile.open(tar_path, "r") as tar:
        tar.extractall(path=extract_dir)
    print(f"Extracted {tar_path} to {extract_dir}")

def extract_imagenet_train(main_tar, output_dir, remove_subtars=True):
    # Ensure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    
    # Step 1: Extract the main tar file.
    if os.path.exists(main_tar):
        print("Extracting main tar file...")
        extract_tar(main_tar, output_dir)

    # Step 2: For each class-specific tar file in the output directory, extract it.
    print("Extracting class-specific tar files...")
    for filename in os.listdir(output_dir):
        if filename.endswith(".tar"):
            tar_path = os.path.join(output_dir, filename)
            # Create a folder with the same name as the tar file (minus the .tar extension)
            class_dir = os.path.join(output_dir, filename[:-4])
            os.makedirs(class_dir, exist_ok=True)
            extract_tar(tar_path, class_dir)
            if remove_subtars:
                os.remove(tar_path)
                print(f"Removed {tar_path}")

In [None]:
#unzip the imagenet train data
extract_imagenet_train('/storage/data/changyu/imagenet/ILSVRC2012_img_train.tar', '/storage/data/changyu/imagenet/train')

In [104]:
# Set the paths to your validation images and the ground-truth label file.
val_dir = '/scratch/data/imagenet/val'
label_file = '/scratch/data/imagenet/ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt'

# Read the ground truth labels; each line corresponds to an image.
with open(label_file, 'r') as f:
    labels = [line.strip() for line in f.readlines()]

val_images = sorted(os.listdir(val_dir))

In [105]:
# Build the mapping dictionary "path": label.
mapping = {}
for i, img_file in enumerate(val_images):
    # Full path to the image file
    img_path = os.path.join("val", img_file)
    
    # It's good to check that we have a label for each image.
    if i < len(labels):
        mapping[img_path] = int(labels[i])
    else:
        print(f"Warning: No label found for {img_file}")

# Write the mapping to a JSON file.
output_file = '/scratch/data/imagenet/val.json'
with open(output_file, 'w') as json_file:
    json.dump(mapping, json_file, indent=4)

print(f"JSON mapping file created successfully and saved as '{output_file}'.")

JSON mapping file created successfully and saved as '/scratch/data/imagenet/val.json'.


In [106]:
#Load the meta.mat file.
meta = scipy.io.loadmat("/scratch/data/imagenet/ILSVRC2012_devkit_t12/data/meta.mat")
synsets = meta['synsets']


In [107]:
#Build a mapping from wnid to a 0-indexed label (0 to 999)
wnid_to_label = {}
for idx, syn in enumerate(synsets):
    # Assuming the structure is like: [ 'ILSVRC2012_ID', 'WNID', 'words', ... ]
    # and that WNID is at index 1:
    wnid = syn[0][1][0]  # adjust index if needed based on meta.mat structure
    wnid_to_label[wnid] = syn[0][0][0][0]

In [133]:
wnid_to_label["n02092339"]

25

In [134]:
synsets[24]

array([(array([[25]], dtype=uint8), array(['n02092339'], dtype='<U9'), array(['Weimaraner'], dtype='<U10'), array(['large breed of hound having a smooth greyish coat; originated in Germany'],
             dtype='<U72'), array([[0]], dtype=uint8), array([], shape=(1, 0), dtype=uint8), array([[0]], dtype=uint8), array([[1300]], dtype=uint16))                                         ],
      dtype=[('ILSVRC2012_ID', 'O'), ('WNID', 'O'), ('words', 'O'), ('gloss', 'O'), ('num_children', 'O'), ('children', 'O'), ('wordnet_height', 'O'), ('num_train_images', 'O')])

In [None]:

# Assuming your training images are stored in a directory where each subfolder is named by the wnid.
dataset_dir = "/scratch/data/imagenet/train"  # adjust this path as needed
img_label_mapping = {}

for wnid in os.listdir('/scratch/data/imagenet/train'):

    folder_path = os.path.join(dataset_dir, wnid)

    if not os.path.isdir(folder_path):
        continue
    label = wnid_to_label.get(wnid)
    # Only proceed if the wnid is in our mapping.
    if label is None:
        continue
    # Loop through each image in the subfolder.
    for img_file in os.listdir(folder_path):
        # Construct the full path for the image.
        img_path = os.path.join("train", wnid, img_file)
        img_label_mapping[img_path] = int(label)

# Step 4: Save the mapping as a JSON file.
with open('/scratch/data/imagenet/train.json', 'w') as f:
    json.dump(img_label_mapping, f, indent=4)

print("JSON file '/scratch/data/imagenet/train.json' generated successfully.")


n02114712
/scratch/data/imagenet/train/n02114712
n02092339
/scratch/data/imagenet/train/n02092339
n03791053
/scratch/data/imagenet/train/n03791053
n02128385
/scratch/data/imagenet/train/n02128385
n04141076
/scratch/data/imagenet/train/n04141076
n03866082
/scratch/data/imagenet/train/n03866082
n01847000
/scratch/data/imagenet/train/n01847000
n04525305
/scratch/data/imagenet/train/n04525305
n01773797
/scratch/data/imagenet/train/n01773797
n02108000
/scratch/data/imagenet/train/n02108000
n04557648
/scratch/data/imagenet/train/n04557648
n10565667
/scratch/data/imagenet/train/n10565667
n04228054
/scratch/data/imagenet/train/n04228054
n03775071
/scratch/data/imagenet/train/n03775071
n02113186
/scratch/data/imagenet/train/n02113186
n02137549
/scratch/data/imagenet/train/n02137549
n03110669
/scratch/data/imagenet/train/n03110669
n02098413
/scratch/data/imagenet/train/n02098413
n02950826
/scratch/data/imagenet/train/n02950826
n02486261
/scratch/data/imagenet/train/n02486261
n04118538
/scratch/d

In [88]:
img_label_mapping

{'train/n02114712/n02114712_16558.JPEG': 439,
 'train/n02114712/n02114712_18198.JPEG': 439,
 'train/n02114712/n02114712_23650.JPEG': 439,
 'train/n02114712/n02114712_21167.JPEG': 439,
 'train/n02114712/n02114712_23819.JPEG': 439,
 'train/n02114712/n02114712_4755.JPEG': 439,
 'train/n02114712/n02114712_11148.JPEG': 439,
 'train/n02114712/n02114712_20341.JPEG': 439,
 'train/n02114712/n02114712_23078.JPEG': 439,
 'train/n02114712/n02114712_22556.JPEG': 439,
 'train/n02114712/n02114712_9617.JPEG': 439,
 'train/n02114712/n02114712_837.JPEG': 439,
 'train/n02114712/n02114712_584.JPEG': 439,
 'train/n02114712/n02114712_15938.JPEG': 439,
 'train/n02114712/n02114712_17588.JPEG': 439,
 'train/n02114712/n02114712_17039.JPEG': 439,
 'train/n02114712/n02114712_2047.JPEG': 439,
 'train/n02114712/n02114712_16891.JPEG': 439,
 'train/n02114712/n02114712_18999.JPEG': 439,
 'train/n02114712/n02114712_1263.JPEG': 439,
 'train/n02114712/n02114712_4248.JPEG': 439,
 'train/n02114712/n02114712_16240.JPEG': 43

In [90]:
synsets[438]

array([(array([[439]], dtype=uint16), array(['n02114712'], dtype='<U9'), array(['red wolf, maned wolf, Canis rufus, Canis niger'], dtype='<U46'), array(['reddish-grey wolf of southwestern North America'], dtype='<U47'), array([[0]], dtype=uint8), array([], shape=(1, 0), dtype=uint8), array([[0]], dtype=uint8), array([[1156]], dtype=uint16))],
      dtype=[('ILSVRC2010_ID', 'O'), ('WNID', 'O'), ('words', 'O'), ('gloss', 'O'), ('num_children', 'O'), ('children', 'O'), ('wordnet_height', 'O'), ('num_train_images', 'O')])

In [2]:
#extract the json file containing 1% label
with open('/storage/data/changyu/imagenet/train.json', 'r') as f:
    img_label_mapping = json.load(f)





In [4]:
#Only keep 1% of the labels in img_label_mapping 
new_img_label_mapping = {}
for key in img_label_mapping:
    if random.random() < 0.01:
        new_img_label_mapping[key] = img_label_mapping[key]

#save the new json file
with open('/storage/data/changyu/imagenet/train_1percent.json', 'w') as f:
    json.dump(new_img_label_mapping, f, indent=4)



In [5]:
#Only keep 3% of the labels in img_label_mapping 
new_img_label_mapping = {}
for key in img_label_mapping:
    if random.random() < 0.03:
        new_img_label_mapping[key] = img_label_mapping[key]

#save the new json file
with open('/storage/data/changyu/imagenet/train_3percent.json', 'w') as f:
    json.dump(new_img_label_mapping, f, indent=4)

In [6]:
#Only keep 5% of the labels in img_label_mapping 
new_img_label_mapping = {}
for key in img_label_mapping:
    if random.random() < 0.05:
        new_img_label_mapping[key] = img_label_mapping[key]

#save the new json file
with open('/storage/data/changyu/imagenet/train_5percent.json', 'w') as f:
    json.dump(new_img_label_mapping, f, indent=4)

In [7]:
#Only keep 10% of the labels in img_label_mapping 
new_img_label_mapping = {}
for key in img_label_mapping:
    if random.random() < 0.1:
        new_img_label_mapping[key] = img_label_mapping[key]

#save the new json file
with open('/storage/data/changyu/imagenet/train_10percent.json', 'w') as f:
    json.dump(new_img_label_mapping, f, indent=4)