In [42]:
import os
import datetime
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import category_encoders as ce

from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from tensorflow.keras import optimizers

import cv2
from PIL import Image

import Augmentor

In [2]:
pd.set_option('display.max_rows', None)

# Using Agnes Martin Catalog Data with Style, Motif, and Subject labels with images to train a CNN

**Process**
1. create a dataframe with the images and labels
2. split the dataset into train and test -> make sure the class distributions are the same for both
3. Build, train, test a baseline CNN model to predict the class of each image
4. Save the predictions (probability of each class) to be used as features in clustering model

## Step 1: create our feature matrix with our images and labels

In [3]:
# set path to data
data_path = '../src/Data/CSV_Files/'

# read csv file and print out first few rows
df = pd.read_csv(data_path+'catalog_data.csv')
df['id'] = df['id'].apply(lambda x: format(x, '.3f'))
df.head()

Unnamed: 0,id,link,title,no_pieces,circa,year_made,"medium (alpha, )",support,height (cm),width (cm),...,no_ex,no_exspaces,no_semsoloex,no_pub,red,green,blue,hue,saturation,light
0,1947.003,https://www.artifexpress.com/catalogues/agnes-...,Self Portrait,1,1,1947,encaustic,canvas,66.0,49.5,...,2,2,1,0,96,83,58,39,0.25,0.3
1,1947.006,https://www.artifexpress.com/catalogues/agnes-...,Portrait of Daphne Vaughn,1,1,1947,encaustic,canvas,50.8,40.6,...,2,4,0,4,122,65,36,20,0.54,0.31
2,1947.001,https://www.artifexpress.com/catalogues/agnes-...,Nude,1,0,1947,oil,canvas,50.8,40.6,...,3,6,0,4,125,122,73,57,0.26,0.39
3,1948.001,https://www.artifexpress.com/catalogues/agnes-...,Untitled,1,1,1948,encaustic,canvas,45.7,35.6,...,2,4,0,2,87,65,64,3,0.15,0.3
4,1949.001,https://www.artifexpress.com/catalogues/agnes-...,Untitled,1,1,1949,oil,masonite,25.6,53.3,...,2,4,0,6,91,48,38,11,0.41,0.25


In [4]:
# drop all columns except for id and subject
image_classes = df[['id', 'subject']]
image_classes.head()

Unnamed: 0,id,subject
0,1947.003,portrait
1,1947.006,portrait
2,1947.001,portrait
3,1948.001,still life
4,1949.001,abstract


In [15]:
image_classes['subject'].value_counts()

bands         255
repetition    196
grid           76
rectangle      47
square         19
symmetry       14
shape          10
circle          4
abstract        4
portrait        3
figure          3
gestural        2
landscape       2
na              2
still life      1
Name: subject, dtype: int64

In [67]:
class_name = image_classes['subject'].unique().tolist()

aug = {}

for subject in class_name:
    num = image_classes['subject'][image_classes['subject'] == subject].value_counts().values[0]
    num = 1000 - num
    aug[subject] = num
    
aug

{'portrait': 997,
 'still life': 999,
 'abstract': 996,
 'landscape': 998,
 'figure': 997,
 'shape': 990,
 'gestural': 998,
 'rectangle': 953,
 'square': 981,
 'symmetry': 986,
 'na': 998,
 'repetition': 804,
 'grid': 924,
 'circle': 996,
 'bands': 745}

# move images into folders for each class label (Subject)

In [64]:
for subject in class_name:
    temp = image_classes[image_classes["subject"] == subject]
    ids = temp['id'].values.tolist()
    subject = subject.replace(" ", "_")
    
    for a_id in ids:
        image_name = a_id + ".png"
        try:
            Path(f"./Data/Images/{image_name}").rename(f"./Data/Images/Subject/{subject}/{image_name}")
        except:
            pass

# load images, resize, created number of augmented images so we have balanced classes and more images to train on 

In [74]:
p = Augmentor.Pipeline('./Data/images/Subject/bands/')

Initialised with 252 image(s) found.
Output directory set to ./Data/images/Subject/bands/output.

In [75]:
# Defining augmentation parameters and generating 5 samples 
p.flip_left_right(0.5) 
p.black_and_white(0.15) 
p.rotate(0.6, 15, 15) 
p.skew(0.6, 0.5) 
p.zoom(probability = 0.3, min_factor = 1.4, max_factor = 2.0)
p.sample(1000)

Processing <PIL.Image.Image image mode=1 size=982x982 at 0x1B76FA597C8>: 100%|█| 1000/1000 [02:35<00:00,  6.43 Samples/


In [79]:
output_path = "./Data/Images/Subject/bands/output/"
subject = "bands"
f_names = os.listdir(output_path)
for i in range(len(f_names)):
    Path(output_path + f_names[i]).replace(f"{output_path}{subject}_{i+1}.png")

In [80]:
for subject in class_name:
    subject = subject.replace(" ", "_")
    path = f'./Data/images/Subject/{subject}/'
    p = Augmentor.Pipeline(path)
    
    # Defining augmentation parameters and generating 5 samples 
    p.flip_left_right(0.5) 
    p.black_and_white(0.15) b
    p.rotate(0.6, 15, 15) 
    p.skew(0.6, 0.5) 
    p.zoom(probability = 0.3, min_factor = 1.4, max_factor = 2.0)
    p.sample(1000)
    
    output_path = f"./Data/Images/Subject/{subject}/output/"
    f_names = os.listdir(output_path)
    for i in range(len(f_names)):
        Path(output_path + f_names[i]).replace(f"{output_path}{subject}_{i+1}.png")

Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 3 image(s) found.
Output directory set to ./Data/images/Subject/portrait/output.

Processing <PIL.Image.Image image mode=RGBA size=790x1024 at 0x1B76F86B448>: 100%|█| 1000/1000 [03:23<00:00,  4.90 Samp
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 1 image(s) found.
Output directory set to ./Data/images/Subject/still_life/output.

Processing <PIL.Image.Image image mode=1 size=834x1024 at 0x1B76F572B08>: 100%|█| 1000/1000 [03:21<00:00,  4.95 Samples
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 4 image(s) found.
Output directory set to ./Data/images/Subject/abstract/output.

Processing <PIL.Image.Image image mode=RGBA size=1026x758 at 0x1B76BE12288>: 100%|█| 1000/1000 [02:51<00:00,  5.83 Samp
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 2 image(s) found.
Output directory set to ./Data/images/Subject/landscape/output.

Processing <PIL.Image.Image image mode=RGBA size=1026x870 at 0x1B76F598B08>: 100%|█| 1000/1000 [03:36<00:00,  4.62 Samp
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 3 image(s) found.
Output directory set to ./Data/images/Subject/figure/output.

Processing <PIL.Image.Image image mode=RGBA size=1028x710 at 0x1B70036D0C8>: 100%|█| 1000/1000 [03:19<00:00,  5.02 Samp
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 10 image(s) found.
Output directory set to ./Data/images/Subject/shape/output.

Processing <PIL.Image.Image image mode=RGBA size=872x606 at 0x1B76E79BE08>: 100%|█| 1000/1000 [02:36<00:00,  6.40 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 1 image(s) found.
Output directory set to ./Data/images/Subject/gestural/output.

Processing <PIL.Image.Image image mode=RGBA size=870x542 at 0x1B7003F3B48>: 100%|█| 1000/1000 [02:03<00:00,  8.12 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 46 image(s) found.
Output directory set to ./Data/images/Subject/rectangle/output.

Processing <PIL.Image.Image image mode=RGBA size=982x976 at 0x1B7005A7208>: 100%|█| 1000/1000 [03:54<00:00,  4.26 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 19 image(s) found.
Output directory set to ./Data/images/Subject/square/output.

Processing <PIL.Image.Image image mode=RGBA size=970x962 at 0x1B76FF968C8>: 100%|█| 1000/1000 [04:23<00:00,  3.79 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 14 image(s) found.
Output directory set to ./Data/images/Subject/symmetry/output.

Processing <PIL.Image.Image image mode=RGBA size=972x984 at 0x1B76FD8AA88>: 100%|█| 1000/1000 [03:51<00:00,  4.31 Sampl


Initialised with 0 image(s) found.
Output directory set to ./Data/images/Subject/na/output.

IndexError: There are no images in the pipeline. Add a directory using add_directory(), pointing it to a directory containing images.

In [81]:
class_name_left = [
    'repetition',
    'grid',
    'circle',
    'bands'
]

for subject in class_name_left:
    
    path = f'./Data/images/Subject/{subject}/'
    p = Augmentor.Pipeline(path)
    
    # Defining augmentation parameters and generating 5 samples 
    p.flip_left_right(0.5) 
    p.black_and_white(0.15) 
    p.rotate(0.6, 15, 15) 
    p.skew(0.6, 0.5) 
    p.zoom(probability = 0.3, min_factor = 1.4, max_factor = 2.0)
    p.sample(1000)
    
    output_path = f"./Data/Images/Subject/{subject}/output/"
    f_names = os.listdir(output_path)
    for i in range(len(f_names)):
        Path(output_path + f_names[i]).replace(f"{output_path}{subject}_{i+1}.png")

Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 193 image(s) found.
Output directory set to ./Data/images/Subject/repetition/output.

Processing <PIL.Image.Image image mode=1 size=790x798 at 0x1B77006C108>: 100%|█| 1000/1000 [02:47<00:00,  5.98 Samples/
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 74 image(s) found.
Output directory set to ./Data/images/Subject/grid/output.

Processing <PIL.Image.Image image mode=RGBA size=990x994 at 0x1B76F8F0688>: 100%|█| 1000/1000 [03:30<00:00,  4.75 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 4 image(s) found.
Output directory set to ./Data/images/Subject/circle/output.

Processing <PIL.Image.Image image mode=RGBA size=978x986 at 0x1B700253A88>: 100%|█| 1000/1000 [03:48<00:00,  4.37 Sampl
Executing Pipeline:   0%|                                                               | 0/1000 [00:00<?, ? Samples/s]

Initialised with 252 image(s) found.
Output directory set to ./Data/images/Subject/bands/output.

Processing <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=982x984 at 0x1B76E9AD3C8>: 100%|█| 1000/1000 [03:30<00
