In [2]:
import numpy as np
import pandas as pd
import os
import re

# Annotations

In [3]:
# Original image file names, one per line of the listing file.
with open('annotations_aug/images_list_orig.txt', 'r') as f:
    # Strip only the line terminator: the previous `line[:-1]` removed the last
    # character of the final name whenever the file lacked a trailing newline.
    # Blank lines are skipped so that no empty file name ends up in the list.
    list_images = [line.rstrip('\r\n') for line in f if line.strip()]

print(list_images[:10])
print()
print(len(list_images))

['saint_bernard_188.jpg', 'Ragdoll_164.jpg', 'chihuahua_75.jpg', 'american_bulldog_60.jpg', 'Siamese_9.jpg', 'saint_bernard_122.jpg', 'Bombay_95.jpg', 'Bengal_167.jpg', 'Birman_95.jpg', 'havanese_147.jpg']

7390


In [4]:
# Space-separated annotation list; the first 6 lines are skipped and the
# columns are named explicitly because the file is read without a header row.
path_to_list = 'annotations/list.txt'

annotations_df = pd.read_csv(
    path_to_list,
    names=["File name", "Class id", "Species id", "Breed id"],
    header=None,
    skiprows=6,
    sep=" ",
)

display(annotations_df)

Unnamed: 0,File name,Class id,Species id,Breed id
0,Abyssinian_100,1,1,1
1,Abyssinian_101,1,1,1
2,Abyssinian_102,1,1,1
3,Abyssinian_103,1,1,1
4,Abyssinian_104,1,1,1
...,...,...,...,...
7344,yorkshire_terrier_96,37,2,25
7345,yorkshire_terrier_97,37,2,25
7346,yorkshire_terrier_98,37,2,25
7347,yorkshire_terrier_99,37,2,25


The list of annotations above is missing some images as there are 7390 images in total in the images folder. So we'll read the filenames from the images directory and make a more complete annotations_df. But first we want a dictionary relating the species and breed name to the (class id,species id,breed id).

First we define a function that will take the file name and output the corresponding Species:breed. The file names are of the following format: breed_number.ext with the first letter being in upper case for cats and lower case for dogs. For example, the file name of an image of an Abyssinian cat is like 'Abyssinian_34.jpg' (or .png or .gif) and we want the output to be 'Cat: abyssinian'. Similarly the file name for an image of a Beagle dog is like 'beagle_26.jpg' and we want the output to be 'Dog: beagle'.  

Later when we augment the training set, we'll name the edited images as 'name_am.jpg' and 'name_ca.jpg' where 'name' is the original name. So we'll design the format_name function to work on such file names as well. 

In [5]:
def cat_or_dog(inp):
    """Prefix a breed name with its species.

    `inp` is the breed part of a file name (no number, no underscores, no
    extension). A leading upper-case letter marks a cat, anything else a dog.
    Cat names are lower-cased; dog names are returned unchanged.
    """
    if inp[0].isupper():
        return 'Cat: ' + inp.lower()
    return 'Dog: ' + inp


# Trailing "_<number>" with an optional augmentation tag ("_am" or "_ca").
_NAME_SUFFIX = re.compile(r'_[0-9]+(?:_(?:am|ca))?$')


def format_name(inp):
    """Turn an image file name into its 'Species: breed' label.

    Accepts names such as 'Abyssinian_100.png', 'Abyssinian_93_am.jpg',
    'Abyssinian_27_ca.jpg' or an extension-less 'beagle_26'.

    The numeric suffix (and augmentation tag, if any) is removed with an
    anchored pattern instead of fixed-width slicing, so a name that lacks the
    number no longer loses characters from the breed itself.
    """
    stem = os.path.splitext(inp)[0]
    breed = _NAME_SUFFIX.sub('', stem)
    # Multi-word breeds are joined with underscores in the file names.
    breed = breed.replace('_', ' ')
    return cat_or_dog(breed)

# Show the label for an original, an '_am' and a '_ca' file name,
# separated by blank lines.
print(
    *(format_name(sample) for sample in ('Abyssinian_100.png', 'Abyssinian_93_am.jpg', 'Abyssinian_27_ca.jpg')),
    sep='\n\n',
)

Cat: abyssinian

Cat: abyssinian

Cat: abyssinian


In [6]:
# Map each 'Species: breed' label to the id columns of the first row in
# annotations_df that carries that label.
annotations_dict = {}
for i in range(len(annotations_df)):
    pet_label = format_name(annotations_df.iloc[i,0])
    if pet_label in annotations_dict:
        continue   # keep the ids from the first occurrence
    annotations_dict[pet_label] = tuple(annotations_df.iloc[i,1:])
annotations_dict

{'Cat: abyssinian': (1, 1, 1),
 'Dog: american bulldog': (2, 2, 1),
 'Dog: american pit bull terrier': (3, 2, 2),
 'Dog: basset hound': (4, 2, 3),
 'Dog: beagle': (5, 2, 4),
 'Cat: bengal': (6, 1, 2),
 'Cat: birman': (7, 1, 3),
 'Cat: bombay': (8, 1, 4),
 'Dog: boxer': (9, 2, 5),
 'Cat: british shorthair': (10, 1, 5),
 'Dog: chihuahua': (11, 2, 6),
 'Cat: egyptian mau': (12, 1, 6),
 'Dog: english cocker spaniel': (13, 2, 7),
 'Dog: english setter': (14, 2, 8),
 'Dog: german shorthaired': (15, 2, 9),
 'Dog: great pyrenees': (16, 2, 10),
 'Dog: havanese': (17, 2, 11),
 'Dog: japanese chin': (18, 2, 12),
 'Dog: keeshond': (19, 2, 13),
 'Dog: leonberger': (20, 2, 14),
 'Cat: maine coon': (21, 1, 7),
 'Dog: miniature pinscher': (22, 2, 15),
 'Dog: newfoundland': (23, 2, 16),
 'Cat: persian': (24, 1, 8),
 'Dog: pomeranian': (25, 2, 17),
 'Dog: pug': (26, 2, 18),
 'Cat: ragdoll': (27, 1, 9),
 'Cat: russian blue': (28, 1, 10),
 'Dog: saint bernard': (29, 2, 19),
 'Dog: samoyed': (30, 2, 20),
 

In [7]:
class return_annotations():
    """Annotations for a collection of image file names.

    For every file name the 'Species: breed' label is derived with
    format_name() and the ids are looked up in annotations_dict.
    """

    def __init__(self,collection):
        self.collection = collection
        self.names = [format_name(file_name) for file_name in self.collection]
        # One lookup per image; each entry is indexed as [0], [1], [2] below.
        id_entries = [annotations_dict[pet] for pet in self.names]
        self.class_ids = [entry[0] for entry in id_entries]
        self.species_ids = [entry[1] for entry in id_entries]
        self.breed_ids = [entry[2] for entry in id_entries]

    def create_df(self):
        """Return all annotations as a DataFrame, one row per image file."""
        columns = {
            'Image file': self.collection,
            'Pet': self.names,
            'Class id': self.class_ids,
            'Species id': self.species_ids,
            'Breed id': self.breed_ids,
        }
        return pd.DataFrame(columns)


In [8]:
# Entries of avoid_trimaps.txt, one per line; images listed here get no '_am' copy.
with open('annotations_aug/avoid_trimaps.txt', 'r') as f:
    # Strip only the line terminator: the previous `line[:-1]` removed the last
    # character of the final entry whenever the file lacked a trailing newline.
    # Blank lines are skipped.
    avoid_trimaps = [line.rstrip('\r\n') for line in f if line.strip()]


# Original + alpha matted: Train:valid:test = 70:15:15

We have 7390 images (before augmentation). Now we'll make a random split of the images into three sets: train, validation, test, in the ratio 3:1:1 approximately.  We don't want any of the newly created images in the validation or test set. Afterwards we'll add some of the augmented images to the training set to create several datasets with different proportions of original and edited images, aiming to achieve the train:valid:test ratio mentioned above.

Here below and later as well, we'll often fix the random seed so we get the same results when the code is run again. To really ensure pseudorandomness however, we shouldn't do this.

In [10]:
# Fixed seed: one categorical draw per image, in list order, decides its split.
np.random.seed(1)

images_train, images_valid, images_test = [], [], []
for img in list_images:
    choice = np.random.choice([1,2,3],p=[0.6,0.2,0.2])
    # choice is 1, 2 or 3 -> training, validation or test list respectively.
    (images_train, images_valid, images_test)[choice - 1].append(img)

# Report each split's size and its share of the full image list.
for message, split in (
    ('The training set has {} images which is {:.2f} % of the dataset.', images_train),
    ('The validation set has {} images which is {:.2f} % of the dataset.', images_valid),
    ('The test set has {} images which is {:.2f} % of the dataset.', images_test),
):
    print(message.format(len(split), 100*(len(split)/len(list_images))))

The training set has 4449 images which is 60.20 % of the dataset.
The validation set has 1491 images which is 20.18 % of the dataset.
The test set has 1450 images which is 19.62 % of the dataset.


Now we'll prepare a dataframe with the annotations for each of these three sets of images. We'll do it with the return_annotations class defined earlier: its attributes carry the different annotations, and its create_df method returns a dataframe organising all of them.

In [11]:
      
# For each split: build the annotation holder, then its dataframe.
annot_train = return_annotations(images_train)
annot_train_df = annot_train.create_df()

annot_valid = return_annotations(images_valid)
annot_valid_df = annot_valid.create_df()

annot_test = return_annotations(images_test)
annot_test_df = annot_test.create_df()

Let us export the validation and test dataframes as csv files. We'll still be modifying the training set.

In [12]:
# Write the validation and test annotations to csv (no index column).
for frame, csv_path in (
    (annot_valid_df, 'annotations_aug/annotations_valid.csv'),
    (annot_test_df, 'annotations_aug/annotations_test.csv'),
):
    frame.to_csv(csv_path, index=False)

### Three augmented training sets (original + alpha matted)

We started out with roughly $200$ images per class, we moved about $40$ of each into the validation set, about another $40$ into the test set, and we augmented the remaining $120$ to create roughly $240$ images per class. We'll try to have about $190$ images per class in the training dataset, so we have approximately a $70:15:15$ train:valid:test ratio. We'll make three different datasets wherein we'll make up the $190$ images per class from the original and the edited images in different proportions. 

We can think of this ratio of original to edited images as another hyperparameter. We'll go for $r_1= 60:40$, $r_2=50:50$, and $r_3=40:60$. For the set $i$, the number of original images included per class should be about $\frac{190 r_i}{1+r_i}$ and the number of edited ones about $\frac{190}{1+ r_i}$. Since roughly $120$ original and $120$ edited images per class are available, each of the original images in the training split should have a $\frac{19 r_i}{12(1+r_i)}$ chance of getting into the final training set, and each edited image should have a $\frac{19}{12(1+r_i)}$ chance.  

In [20]:
def create_train_set(r):
    """Sample a training set mixing original and '_am' images at ratio r.

    Parameters
    ----------
    r : float
        Target ratio of original to edited images (e.g. 1.5 for 60:40).
        Both selection probabilities must lie in [0, 1], which holds for
        7/12 <= r <= 12/7.

    Returns
    -------
    DataFrame produced by return_annotations(...).create_df() for the
    selected file names.

    Raises
    ------
    ValueError
        If r gives a selection probability outside [0, 1].

    Each image in annot_train_df is kept with probability 19r/(12(1+r)); its
    '_am' copy (unless the image is listed in avoid_trimaps) is added with
    probability 19/(12(1+r)). Draws use the global numpy random state, in the
    same order as before, so seeded results are unchanged.
    """
    # Loop-invariant probabilities, computed once.
    p_original = (19*r)/(12*(1+r))
    p_edited = 19/(12*(1+r))
    if not (0 <= p_original <= 1 and 0 <= p_edited <= 1):
        raise ValueError(
            'r={!r} gives selection probabilities {:.3f} (original) and {:.3f} (edited); '
            'both must be within [0, 1].'.format(r, p_original, p_edited)
        )

    # Set lookup instead of scanning the avoid_trimaps list for every image.
    am_excluded = set(avoid_trimaps)

    t_set = []
    for img in annot_train_df['Image file']:
        stem = os.path.splitext(img)[0]
        if np.random.choice([True,False],p=[p_original,1-p_original]):
            t_set.append(img)
        if stem not in am_excluded:
            if np.random.choice([True,False],p=[p_edited,1-p_edited]):
                t_set.append(stem+'_am.jpg')

    t_annot = return_annotations(t_set)
    return t_annot.create_df()

In [21]:
# Each dataset is generated right after seeding the global numpy RNG, so the
# seed call must stay immediately before the matching create_train_set call.

# r1: original:edited = 60:40, i.e. r = 1.5
np.random.seed(10)
train_r1_df = create_train_set(1.5)
train_r1_df.to_csv('annotations_aug/annotations_train_r1.csv',index=False)

# r2: original:edited = 50:50, i.e. r = 1
np.random.seed(20000)
train_r2_df = create_train_set(1)
train_r2_df.to_csv('annotations_aug/annotations_train_r2.csv',index=False)

# r3: original:edited = 40:60, i.e. r = 2/3
np.random.seed(50000)
train_r3_df = create_train_set(2/3)
train_r3_df.to_csv('annotations_aug/annotations_train_r3.csv',index=False)

Now let's summarize the datasets available to us.

In [22]:
def summarize(train_df, valid_images=None, test_images=None):
    """Print the sizes and composition of a train/validation/test split.

    Parameters
    ----------
    train_df : DataFrame
        Must have an 'Image file' column listing the training files.
    valid_images, test_images : sized collections, optional
        Validation and test images. When omitted, the notebook-level
        `images_valid` / `images_test` lists are used (the previous
        behaviour). Pass them explicitly when those names have since been
        rebound to a different split.

    A training file counts as edited when its stem ends with '_am' or '_ca'.
    Undefined ratios (empty inputs) are reported instead of raising
    ZeroDivisionError.
    """
    if valid_images is None:
        valid_images = images_valid
    if test_images is None:
        test_images = images_test

    len_train = len(train_df)
    len_valid = len(valid_images)
    len_test = len(test_images)
    total = len_train + len_valid + len_test

    number_edited = sum(
        os.path.splitext(file)[0].endswith(('_am', '_ca'))
        for file in train_df['Image file']
    )
    number_original = len_train - number_edited

    print('There are {}, {}, and {} images in the training, validation, and test sets, respectively.'.format(len_train, len_valid, len_test))
    print()
    if total:
        print('The train:valid:test ratio is roughly {:.2f}:{:.2f}:{:.2f}.'.format(len_train*100/total, len_valid*100/total, len_test*100/total))
    else:
        print('The train:valid:test ratio is undefined because all three sets are empty.')
    print()
    if len_train:
        print('The ratio of original images to the edited ones is about {:.2f}:{:.2f}.'.format(number_original*100/len_train,number_edited*100/len_train))
    else:
        print('The ratio of original images to the edited ones is undefined because the training set is empty.')
    
    

In [23]:
# Sizes and composition of the r1 training set (created with r = 1.5).
summarize(train_r1_df)

There are 7109, 1491, and 1450 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 70.74:14.84:14.43.

The ratio of original images to the edited ones is about 59.46:40.54.


In [24]:
# Sizes and composition of the r2 training set (created with r = 1).
summarize(train_r2_df)

There are 6983, 1491, and 1450 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 70.36:15.02:14.61.

The ratio of original images to the edited ones is about 50.49:49.51.


In [25]:
# Sizes and composition of the r3 training set (created with r = 2/3).
summarize(train_r3_df)

There are 7102, 1491, and 1450 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 70.72:14.85:14.44.

The ratio of original images to the edited ones is about 40.78:59.22.


Let's also create another training dataset without rejecting any of the images available in the training set in annot_train_df.

In [28]:
# Keep every training image and add its '_am' copy unless the image is
# listed in avoid_trimaps.
am_excluded = set(avoid_trimaps)   # set lookup instead of scanning the list per image
t_larger_set = []
for img in annot_train_df['Image file']:
    stem = os.path.splitext(img)[0]
    t_larger_set.append(img)
    if stem not in am_excluded:
        t_larger_set.append(stem+'_am.jpg')

t_larger_annot = return_annotations(t_larger_set)
t_larger_annot_df = t_larger_annot.create_df()

t_larger_annot_df.to_csv('annotations_aug/annotations_train_240approx.csv',index=False)

summarize(t_larger_annot_df)

There are 8885, 1491, and 1450 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 75.13:12.61:12.26.

The ratio of original images to the edited ones is about 50.07:49.93.


# Original + alpha matted: Larger training sets

The training sets above seem to be inadequate for the models to learn to distinguish between different breeds (this is reasonable as different breeds in the same species are largely similar). So we will create some datasets with larger training sets and smaller validation and test sets and see if the performance is improved. 

In [30]:
# Fixed seed: one categorical draw per image, in list order, decides its split.
np.random.seed(1)

imgL_train, imgL_valid, imgL_test = [], [], []
for img in list_images:
    choice = np.random.choice([1,2,3],p=[0.8,0.125,0.075])
    # choice is 1, 2 or 3 -> training, validation or test list respectively.
    (imgL_train, imgL_valid, imgL_test)[choice - 1].append(img)

# Report each split's size and its share of the full image list.
for message, split in (
    ('The training set has {} images which is {:.2f} % of the dataset.', imgL_train),
    ('The validation set has {} images which is {:.2f} % of the dataset.', imgL_valid),
    ('The test set has {} images which is {:.2f} % of the dataset.', imgL_test),
):
    print(message.format(len(split), 100*(len(split)/len(list_images))))

The training set has 5940 images which is 80.38 % of the dataset.
The validation set has 919 images which is 12.44 % of the dataset.
The test set has 531 images which is 7.19 % of the dataset.


In [32]:
# For each split of the larger-training-set variant: annotation holder, then dataframe.
annotL_train = return_annotations(imgL_train)
annotL_train_df = annotL_train.create_df()

annotL_valid = return_annotations(imgL_valid)
annotL_valid_df = annotL_valid.create_df()

annotL_test = return_annotations(imgL_test)
annotL_test_df = annotL_test.create_df()

In [33]:
# Write the validation and test annotations of the 'L' split to csv (no index column).
for frame, csv_path in (
    (annotL_valid_df, 'annotations_aug/annotations_valid_L.csv'),
    (annotL_test_df, 'annotations_aug/annotations_test_L.csv'),
):
    frame.to_csv(csv_path, index=False)

In [34]:
# Keep every training image of the 'L' split and add its '_am' copy unless the
# image is listed in avoid_trimaps.
am_excluded = set(avoid_trimaps)   # set lookup instead of scanning the list per image
t_set_Lall = []
for img in annotL_train_df['Image file']:
    stem = os.path.splitext(img)[0]
    t_set_Lall.append(img)
    if stem not in am_excluded:
        t_set_Lall.append(stem+'_am.jpg')

train_Lall = return_annotations(t_set_Lall)
train_Lall_df = train_Lall.create_df()
train_Lall_df.to_csv('annotations_aug/annotations_train_Lall.csv',index=False)

In [36]:
def summarizeL(train_df, valid_images=None, test_images=None):
    """Print the sizes and composition of the 'L' train/validation/test split.

    Parameters
    ----------
    train_df : DataFrame
        Must have an 'Image file' column listing the training files.
    valid_images, test_images : sized collections, optional
        Validation and test images. When omitted, the notebook-level
        `imgL_valid` / `imgL_test` lists are used (the previous behaviour).

    A training file counts as edited when its stem ends with '_am' or '_ca'.
    Undefined ratios (empty inputs) are reported instead of raising
    ZeroDivisionError.
    """
    if valid_images is None:
        valid_images = imgL_valid
    if test_images is None:
        test_images = imgL_test

    len_train = len(train_df)
    len_valid = len(valid_images)
    len_test = len(test_images)
    total = len_train + len_valid + len_test

    number_edited = sum(
        os.path.splitext(file)[0].endswith(('_am', '_ca'))
        for file in train_df['Image file']
    )
    number_original = len_train - number_edited

    print('There are {}, {}, and {} images in the training, validation, and test sets, respectively.'.format(len_train, len_valid, len_test))
    print()
    if total:
        print('The train:valid:test ratio is roughly {:.2f}:{:.2f}:{:.2f}.'.format(len_train*100/total, len_valid*100/total, len_test*100/total))
    else:
        print('The train:valid:test ratio is undefined because all three sets are empty.')
    print()
    if len_train:
        print('The ratio of original images to the edited ones is about {:.2f}:{:.2f}.'.format(number_original*100/len_train,number_edited*100/len_train))
    else:
        print('The ratio of original images to the edited ones is undefined because the training set is empty.')
    
    

In [40]:
# Sizes and composition of the 'L' split with every '_am' copy included.
summarizeL(train_Lall_df)

There are 11860, 919, and 531 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 89.11:6.90:3.99.

The ratio of original images to the edited ones is about 50.08:49.92.


# No augmentation

We'll carry out some experiments based on the original images alone (after resizing to a uniform shape and size). So we'll prepare a dataset with roughly 180 images per class in the training set, 12 images per class in the dev set, and 8 images per class in the test set. 

In [7]:
# Fixed seed: one categorical draw per image, in list order, decides its split.
np.random.seed(1359)

imgO_train, imgO_valid, imgO_test = [], [], []
for img in list_images:
    choice = np.random.choice([1,2,3],p=[0.85,0.1,0.05])
    # choice is 1, 2 or 3 -> training, validation or test list respectively.
    (imgO_train, imgO_valid, imgO_test)[choice - 1].append(img)

# Report each split's size and its share of the full image list.
for message, split in (
    ('The training set has {} images which is {:.2f} % of the dataset.', imgO_train),
    ('The validation set has {} images which is {:.2f} % of the dataset.', imgO_valid),
    ('The test set has {} images which is {:.2f} % of the dataset.', imgO_test),
):
    print(message.format(len(split), 100*(len(split)/len(list_images))))

The training set has 6308 images which is 85.36 % of the dataset.
The validation set has 720 images which is 9.74 % of the dataset.
The test set has 362 images which is 4.90 % of the dataset.


In [13]:
# For each augmentation-free split: annotation holder, dataframe, csv export.
annotO_train = return_annotations(imgO_train)
annotO_train_df = annotO_train.create_df()

annotO_valid = return_annotations(imgO_valid)
annotO_valid_df = annotO_valid.create_df()

annotO_test = return_annotations(imgO_test)
annotO_test_df = annotO_test.create_df()

for frame, csv_path in (
    (annotO_train_df, 'annotations_aug/annotations_train_O.csv'),
    (annotO_valid_df, 'annotations_aug/annotations_valid_O.csv'),
    (annotO_test_df, 'annotations_aug/annotations_test_O.csv'),
):
    frame.to_csv(csv_path, index=False)

In [14]:
# Sizes of the three augmentation-free splits and their share of the total.
len_train, len_valid, len_test = len(imgO_train), len(imgO_valid), len(imgO_test)
total = sum((len_train, len_valid, len_test))

print('There are {}, {}, and {} images in the training, validation, and test sets, respectively.'.format(len_train, len_valid, len_test))
print()
print('The train:valid:test ratio is roughly {:.2f}:{:.2f}:{:.2f}.'.format(*(count*100/total for count in (len_train, len_valid, len_test))))
print()
print('There are no augmented images in this dataset.')
    

There are 6308, 720, and 362 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 85.36:9.74:4.90.

There are no augmented images in this dataset.


# Original + alpha matted + rotations and projective transforms

We'll make a couple of datasets from the original and two sets of augmented images. 

# B1

At first we'll take roughly 150 images per class in the training set, 30 in the validation and 20 in the test set. So we have roughly 450 training images per class including the augmented images for each image in the training set, which gives us a train:valid:test ratio of 90:6:4.

In [10]:
# Fixed seed: one categorical draw per image, in list order, decides its split.
# NOTE: this rebinds images_train / images_valid / images_test used by earlier cells.
np.random.seed(14)

images_train, images_valid, images_test = [], [], []
for img in list_images:
    choice = np.random.choice([1,2,3],p=[0.75,0.15,0.1])
    # choice is 1, 2 or 3 -> training, validation or test list respectively.
    (images_train, images_valid, images_test)[choice - 1].append(img)

# Report each split's size and its share of the original image list.
for message, split in (
    ('The training set has {} original images which is {:.2f} % of the original dataset.', images_train),
    ('The validation set has {} images which is {:.2f} % of the original dataset.', images_valid),
    ('The test set has {} images which is {:.2f} % of the original dataset.', images_test),
):
    print(message.format(len(split), 100*(len(split)/len(list_images))))

The training set has 5553 original images which is 75.14 % of the original dataset.
The validation set has 1117 images which is 15.12 % of the original dataset.
The test set has 720 images which is 9.74 % of the original dataset.


In [12]:
# For each B1 split: annotation holder, then dataframe.
annot_train = return_annotations(images_train)
annot_train_df = annot_train.create_df()

annot_valid = return_annotations(images_valid)
annot_valid_df = annot_valid.create_df()

annot_test = return_annotations(images_test)
annot_test_df = annot_test.create_df()

# Only validation and test are exported here (no index column).
for frame, csv_path in (
    (annot_valid_df, 'annotations_aug/annotations_valid_B1.csv'),
    (annot_test_df, 'annotations_aug/annotations_test_B1.csv'),
):
    frame.to_csv(csv_path, index=False)

In [14]:
# B1 training list: every training image, its '_am' copy unless the image is
# listed in avoid_trimaps, and always its '_ca' copy.
am_excluded = set(avoid_trimaps)   # set lookup instead of scanning the list per image
t_set_B1 = []
for img in annot_train_df['Image file']:
    stem = os.path.splitext(img)[0]
    t_set_B1.append(img)
    if stem not in am_excluded:
        t_set_B1.append(stem+'_am.jpg')
    t_set_B1.append(stem+'_ca.jpg')

train_B1 = return_annotations(t_set_B1)
train_B1_df = train_B1.create_df()
train_B1_df.to_csv('annotations_aug/annotations_train_B1.csv',index=False)

In [16]:
# Sizes of the B1 training set (with augmented images) and of the validation/test splits.
len_train, len_valid, len_test = len(train_B1_df), len(images_valid), len(images_test)
total = sum((len_train, len_valid, len_test))

print('There are {}, {}, and {} images in the training, validation, and test sets, respectively.'.format(len_train, len_valid, len_test))
print()
print('The train:valid:test ratio is roughly {:.2f}:{:.2f}:{:.2f}.'.format(*(count*100/total for count in (len_train, len_valid, len_test))))

    

There are 16642, 1117, and 720 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 90.06:6.04:3.90.


# B2

We'll take roughly 160 images per class in the training set, 25 in the validation and 15 in the test set. So we have roughly 480 training images per class including the augmented images for each image in the training set, which gives us a train:valid:test ratio of 92.3:4.8:2.9

In [9]:
# Fixed seed: one categorical draw per image, in list order, decides its split.
# NOTE: this rebinds images_train / images_valid / images_test used by earlier cells.
np.random.seed(14)

images_train, images_valid, images_test = [], [], []
for img in list_images:
    choice = np.random.choice([1,2,3],p=[0.8,0.125,0.075])
    # choice is 1, 2 or 3 -> training, validation or test list respectively.
    (images_train, images_valid, images_test)[choice - 1].append(img)

# Report each split's size and its share of the original image list.
for message, split in (
    ('The training set has {} original images which is {:.2f} % of the original dataset.', images_train),
    ('The validation set has {} images which is {:.2f} % of the original dataset.', images_valid),
    ('The test set has {} images which is {:.2f} % of the original dataset.', images_test),
):
    print(message.format(len(split), 100*(len(split)/len(list_images))))

The training set has 5883 original images which is 79.61 % of the original dataset.
The validation set has 960 images which is 12.99 % of the original dataset.
The test set has 547 images which is 7.40 % of the original dataset.


In [10]:
# For each B2 split: annotation holder, then dataframe.
annot_train = return_annotations(images_train)
annot_train_df = annot_train.create_df()

annot_valid = return_annotations(images_valid)
annot_valid_df = annot_valid.create_df()

annot_test = return_annotations(images_test)
annot_test_df = annot_test.create_df()

# Only validation and test are exported here (no index column).
for frame, csv_path in (
    (annot_valid_df, 'annotations_aug/annotations_valid_B2.csv'),
    (annot_test_df, 'annotations_aug/annotations_test_B2.csv'),
):
    frame.to_csv(csv_path, index=False)

In [11]:
# B2 training list: every training image, its '_am' copy unless the image is
# listed in avoid_trimaps, and always its '_ca' copy.
am_excluded = set(avoid_trimaps)   # set lookup instead of scanning the list per image
t_set_B2 = []
for img in annot_train_df['Image file']:
    stem = os.path.splitext(img)[0]
    t_set_B2.append(img)
    if stem not in am_excluded:
        t_set_B2.append(stem+'_am.jpg')
    t_set_B2.append(stem+'_ca.jpg')

train_B2 = return_annotations(t_set_B2)
train_B2_df = train_B2.create_df()
train_B2_df.to_csv('annotations_aug/annotations_train_B2.csv',index=False)

In [12]:
# Sizes of the B2 training set (with augmented images) and of the validation/test splits.
len_train, len_valid, len_test = len(train_B2_df), len(images_valid), len(images_test)
total = sum((len_train, len_valid, len_test))

print('There are {}, {}, and {} images in the training, validation, and test sets, respectively.'.format(len_train, len_valid, len_test))
print()
print('The train:valid:test ratio is roughly {:.2f}:{:.2f}:{:.2f}.'.format(*(count*100/total for count in (len_train, len_valid, len_test))))

    

There are 17630, 960, and 547 images in the training, validation, and test sets, respectively.

The train:valid:test ratio is roughly 92.13:5.02:2.86.
