In [32]:
import scipy.io
import numpy as np
import os
import pandas as pd
# np.set_printoptions(threshold=np.inf)

# Root folder of the datasets, given relative to the kernel's working directory.
# Every dataset file used in this notebook is addressed as os.path.join(DATA_PATH_PREFIX, "<dataset>/<file>").
DATA_PATH_PREFIX = "../datasets/"

In [38]:
# Read the flowers annotation files: the label array and the per-split image-id lists.
labels_mat = scipy.io.loadmat(os.path.join(DATA_PATH_PREFIX, 'flowers/imagelabels.mat'))
labels = labels_mat["labels"].flatten()
split = scipy.io.loadmat(os.path.join(DATA_PATH_PREFIX, 'flowers/setid.mat'))

# NOTE: train and test are swapped relative to the key names in setid.mat:
# "tstid" feeds train_split and "trnid" feeds test_split ("valid" is used as-is).
# Presumably intentional so that training uses the larger id list — confirm before changing.
train_split, test_split, val_split = (
    split[key].flatten() for key in ("tstid", "trnid", "valid")
)

# Sanity check: show how many image ids ended up in each split.
for ids in (train_split, test_split, val_split):
    print(ids.shape)

def prepare_df(split, all_labels=None):
    """
    Build the (image file name, label) table for one dataset split.

    split: An iterable (e.g. a numpy array) of 1-based integer image ids for this split.
    all_labels: Optional sequence of labels in which all_labels[i - 1] is the label of image id i.
        When omitted, the notebook-level `labels` array is used, so existing
        prepare_df(split) calls keep working unchanged.

    Returns: A Pandas DataFrame containing 2 columns: img_name, which contains the image id
        zero-padded to at least 5 digits followed by ".jpg" (e.g. "00042.jpg"), and label,
        the entry of all_labels for that id. Rows follow the order of `split`.
        NOTE(review): no "image_" prefix is added to img_name — confirm this matches the
        file names on disk / what the consumer of the CSV expects.

    Raises: IndexError if any id lies outside 1..len(all_labels). Without this check an id
        of 0 (or a negative id) given as a plain/signed integer would wrap around to the
        end of the label array and silently produce a wrong label.
    """
    if all_labels is None:
        # Backward-compatible default: the label array loaded at notebook level.
        all_labels = labels

    # Plain Python ints avoid unsigned wrap-around when computing `image_id - 1`.
    image_ids = [int(n) for n in split]

    n_labels = len(all_labels)
    for image_id in image_ids:
        if not 1 <= image_id <= n_labels:
            raise IndexError(
                "image id %d is outside the valid range 1..%d" % (image_id, n_labels)
            )

    return pd.DataFrame(
        {
            "img_name": np.array([f"{image_id:05d}.jpg" for image_id in image_ids]),
            # Image ids are 1-based, the label array is 0-based.
            "label": np.array([all_labels[image_id - 1] for image_id in image_ids]),
        },
        columns=["img_name", "label"],
    )

# Build one (img_name, label) table per split.
train_df, test_df, val_df = (
    prepare_df(ids) for ids in (train_split, test_split, val_split)
)

# Write each table next to the dataset files, with a header row and without the index column.
for csv_rel_path, frame in (
    ('flowers/train_csv.csv', train_df),
    ('flowers/test_csv.csv', test_df),
    ('flowers/val_csv.csv', val_df),
):
    frame.to_csv(os.path.join(DATA_PATH_PREFIX, csv_rel_path), index=False, header=True)

(6149,)
(1020,)
(1020,)
