In [1]:
from scipy import io
import numpy as np
import pandas as pd
import sys
from os import path, mkdir, rename
try:
    import xmltodict
except ImportError:
    print("Installing a module to convert XML into Python dictionaries, hold on...")
    !conda install --yes --prefix {sys.prefix} xmltodict  # I really hope you're running conda
    import xmltodict

from IPython.display import display, clear_output
from PIL import Image  # for working with images
from random import random, seed, choices
# Stanford files (inputs; only ever read by this notebook):
annotation_path = "annotations"
image_path_original = "images"
lists_path = "lists"

# Our files (outputs; the cropped images are written below these directories):
image_path_sorted = "images_sorted"
image_path_train = path.join(image_path_sorted, "train")
image_path_dev = path.join(image_path_sorted, "dev")
image_path_test = path.join(image_path_sorted, "test")

# Create whichever output directories are missing. Every directory is checked on
# its own so that a partially created tree (e.g. `images_sorted` exists but `dev`
# does not) is repaired rather than silently skipped. The parent directory is
# listed first so it exists before its children are created.
output_dirs = [image_path_sorted, image_path_train, image_path_dev, image_path_test]
missing_dirs = [candidate for candidate in output_dirs if not path.isdir(candidate)]
if missing_dirs:
    print("Creating missing directories... ", end='', flush=True)
    for missing_dir in missing_dirs:
        mkdir(missing_dir)
    print("Done")


# fractions (sum=1):
frac_train = 0.8
frac_dev = 0.1
frac_test = 0.1
# Compare with a tolerance because the fractions are binary floats.
assert abs(frac_train + frac_dev + frac_test - 1) < 1e-9, "split fractions must sum to 1"


# Stanford-provided MATLAB lists. `file_list` supplies the "file_list",
# "annotation_list" and "labels" arrays iterated over in the later cells.
file_list = io.loadmat(path.join(lists_path, "file_list.mat"))
train_list = io.loadmat(path.join(lists_path, "train_list.mat"))
test_list = io.loadmat(path.join(lists_path, "test_list.mat"))

# Random seed - leave unchanged for consistency between runs.
# The seed is only set here, so re-run this cell before re-running the splitting
# cell if the same train/dev/test assignment is needed again.
seed(343)

## Label Translation Table

The `file_list.mat` file contains numerical labels, whereas the XML metadata/annotation files contain text labels, so we'll build a "translation table" mapping text labels $\rightarrow$ numerical labels and vice versa, which will come in useful later.

Critical assumption: since each file is given exactly one label, any file containing multiple dogs must contain dogs that are all of the same breed.

We'll also tally the total number of images for convenience later.

In [2]:
# Build the breed-name -> numeric-label lookup by pairing every annotation file
# with its label from `file_list.mat`, counting the images along the way.
translation_table_txt2num = {}
total_images = 0
labelled_annotations = zip(file_list["annotation_list"], file_list["labels"])
for total_images, (annotation, label) in enumerate(labelled_annotations, start=1):
    with open(path.join(annotation_path, annotation[0][0]), 'rb') as f:
        meta_dict = xmltodict.parse(f.read())["annotation"]
    # "object" is a list when several dogs are annotated and a single dict
    # otherwise; only the first dog's breed name is used (one label per file).
    annotated = meta_dict["object"]
    first_dog = annotated[0] if isinstance(annotated, list) else annotated
    translation_table_txt2num[first_dog["name"]] = label[0]

# Tabulate the mapping, show it, and save it for the other group members.
df = (
    pd.DataFrame.from_dict(translation_table_txt2num, orient='index', columns=["Count"])
    .reset_index()
    .rename(columns={"index": "Breed", "Count": "Label"})
)
display(df)
df.to_csv("label_translation_table_txt2num.csv")

# Reverse lookup: numeric label -> breed name.
translation_table_num2txt = dict(
    (number, breed) for breed, number in translation_table_txt2num.items()
)

Unnamed: 0,Breed,Label
0,Chihuahua,1
1,Japanese_spaniel,2
2,Maltese_dog,3
3,Pekinese,4
4,Shih-Tzu,5
...,...,...
115,standard_poodle,116
116,Mexican_hairless,117
117,dingo,118
118,dhole,119


## Test Splits and Bounding Boxes 

Although the Stanford dataset did come ready with test/train splits, they were around 58:42, far from the 80:10:10 decided on by the group members. As such, I'll use random numbers with a constant seed to generate my own splits in a reproducible manner.

- Problem: I currently pull file names from the XML data; however, some annotation files (seemingly at random) simply have "%s" as the file name, e.g. English Foxhound `n02089973_382`.

Solution: ?


- ~~Problem~~: For whatever reason, despite all images *supposedly* being JPGs with a depth of 3, I happened across an error where PIL reported an image had an Alpha channel (RGBA) which cannot be saved as a JPEG/JPG.

Solution: `im.mode` and `im.convert`, also saving as PNG to avoid lossy compression.

In [3]:
def bbox_cut(im, bndbox):
    """Crop a PIL image to an annotation bounding box.

    Args:
        im: image object to crop; its `crop` method is called with the box.
        bndbox: mapping with "xmin", "ymin", "xmax" and "ymax" entries, each
            convertible with `int()`.

    Returns:
        Whatever `im.crop` returns for the (xmin, ymin, xmax, ymax) box.
    """
    # The coordinates are handed to Image.crop unchanged, in the
    # (left, upper, right, lower) order that it expects.
    corners = tuple(int(bndbox[key]) for key in ("xmin", "ymin", "xmax", "ymax"))
    return im.crop(corners)

# Candidate destination directories and the weight of each, in matching order.
pop = [image_path_train, image_path_dev, image_path_test]
prob = [frac_train, frac_dev, frac_test]
def split_loc():
    """Draw one destination directory at random, weighted by the split fractions.

    Returns:
        One entry of `pop`, selected by `random.choices` using `prob` as weights.
    """
    (chosen_dir,) = choices(pop, weights=prob, k=1)
    return chosen_dir

In [4]:
# Set to True to display each source image and its crops while they are processed.
visualise = False

# Progress reporting: when enabled, a "Done N | x%" line is printed every
# `freq_track` images.
prog_track = True
freq_track = 100
processed = 0
# Starting at the threshold makes the very first image trigger a progress line.
state = freq_track

In [5]:
# Crop every annotated dog out of every image and save each crop, as
# `<filename>-<crop number>-<numeric label>.png`, into a randomly chosen
# train/dev/test directory.
# NOTE(review): the destination is drawn once per crop, so two dogs cropped from
# the same photo can land in different splits - confirm that this is acceptable.
for image, annotation, label in zip(file_list["file_list"], file_list["annotation_list"], file_list["labels"]):
    if visualise:
        clear_output(wait=True)
    if prog_track:
        if state >= freq_track:
            clear_output(wait=True)
            state = 0
            print(f"Done {processed} | {100*(processed/total_images):.3f}%")
        processed += 1
        state += 1
    # Load the image with PIL; the context manager closes the file handle once
    # all crops of this image have been written.
    with Image.open(path.join(image_path_original, image[0][0])) as im_file:
        # Ensure image is RGB (e.g. not RGBA or whatever...). Compare by value:
        # `is not` tests object identity, which is not reliable for strings.
        im = im_file if im_file.mode == "RGB" else im_file.convert("RGB")
        if visualise:
            display(im)
        # Open the annotation file and get the bounding box data:
        with open(path.join(annotation_path, annotation[0][0]), 'rb') as f:
            meta_dict = xmltodict.parse(f.read())["annotation"]
        # "object" is a list when several dogs are annotated and a single dict
        # when there is only one; normalise so both cases share one code path.
        objects = meta_dict["object"]
        if not isinstance(objects, list):
            objects = [objects]
        # Some annotation files carry the placeholder "%s" instead of a real
        # file name, which would make their crops overwrite one another. Fall
        # back to the annotation file's own base name in that case.
        # (assumes annotation files are named after the image id - TODO confirm)
        file_stem = meta_dict["filename"]
        if file_stem == "%s":
            file_stem = path.basename(annotation[0][0])
        for i, obj_dict in enumerate(objects):
            # get our cropped image and save it as `filename-cropnumber-label.png`
            im_cut = bbox_cut(im, obj_dict["bndbox"])
            if visualise:
                display(im_cut)
            im_cut.save(path.join(split_loc(), f"{file_stem}-{i}-{translation_table_txt2num[obj_dict['name']]}.png"))

# Final progress line once every image has been handled.
if prog_track:
    clear_output(wait=True)
    print(f"Done {processed} | {100*(processed/total_images):.3f}%")

Done 20500 | 99.61127%


## TODO:
- ~~cut out bounding boxes~~
- ~~Split into train, dev, and test sets~~
- merge in the renaming scheme from other group members, or propose a new one