In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import category_encoders as ce

from sklearn.model_selection import train_test_split

# Using Agnes Martin Catalog Data with Style, Motif, and Subject labels with images to train a CNN

**Process**
1. create a dataframe with the images and labels
2. split the dataset into train and test -> make sure the class distributions are the same for both
3. Build, train, test a baseline CNN model to predict the class of each image
4. Save the predictions (probability of each class) to be used as features in clustering model

## Step 1: create our feature matrix with our images and labels

In [25]:
# set path to data
data_path = 'src/Data/CSV_Files/'

# read csv file and print out first few rows
df = pd.read_csv(data_path+'catalog_data.csv')
df.head()

Unnamed: 0,id,link,title,no_pieces,circa,year_made,"medium (alpha, )",support,height (cm),width (cm),...,no_ex,no_exspaces,no_semsoloex,no_pub,red,green,blue,hue,saturation,light
0,1947.003,https://www.artifexpress.com/catalogues/agnes-...,Self Portrait,1,1,1947,encaustic,canvas,66.0,49.5,...,2,2,1,0,96,83,58,39,0.25,0.3
1,1947.006,https://www.artifexpress.com/catalogues/agnes-...,Portrait of Daphne Vaughn,1,1,1947,encaustic,canvas,50.8,40.6,...,2,4,0,4,122,65,36,20,0.54,0.31
2,1947.001,https://www.artifexpress.com/catalogues/agnes-...,Nude,1,0,1947,oil,canvas,50.8,40.6,...,3,6,0,4,125,122,73,57,0.26,0.39
3,1948.001,https://www.artifexpress.com/catalogues/agnes-...,Untitled,1,1,1948,encaustic,canvas,45.7,35.6,...,2,4,0,2,87,65,64,3,0.15,0.3
4,1949.001,https://www.artifexpress.com/catalogues/agnes-...,Untitled,1,1,1949,oil,masonite,25.6,53.3,...,2,4,0,6,91,48,38,11,0.41,0.25


In [14]:
# drop all columns except for id and subject
image_classes = df[['id', 'subject']]
image_classes.head()

Unnamed: 0,id,subject
0,1947.003,portrait
1,1947.006,portrait
2,1947.001,portrait
3,1948.001,still life
4,1949.001,abstract


In [13]:
# look at distribution of classes
df['subject'].value_counts()

bands         255
repetition    196
grid           76
rectangle      47
square         19
symmetry       14
shape          10
circle          4
abstract        4
figure          3
portrait        3
na              2
gestural        2
landscape       2
still life      1
Name: subject, dtype: int64

In [31]:
# drop still life from dataframe before split, need more than 2 for stratified split
still_life = df[df['subject'] == 'still life'] 
df = df[df['subject'] != 'still life']

In [32]:
# split into X features and y target -> style labels are the target classes we are trying to predict and 
# the id's will be used to get images which are the features for our model.
X = df['id']
y = df['subject']

# split into train and test (80/20), stratified = y to make sure we keep the class distributions for train/test the same
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)

# print out the shape of X and y for the train and test data
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((509,), (128,), (509,), (128,))

In [50]:
# look at the distribution of classes fot the training set
y_train.value_counts(normalize=True)*100

bands         40.078585
repetition    30.844794
grid          11.984283
rectangle      7.269155
square         2.946955
symmetry       2.161100
shape          1.571709
circle         0.589391
abstract       0.589391
na             0.392927
figure         0.392927
portrait       0.392927
gestural       0.392927
landscape      0.392927
Name: subject, dtype: float64

In [51]:
# look at the distribution of classes fot the testing set
y_test.value_counts(normalize=True)*100

bands         39.84375
repetition    30.46875
grid          11.71875
rectangle      7.81250
square         3.12500
symmetry       2.34375
shape          1.56250
circle         0.78125
abstract       0.78125
figure         0.78125
portrait       0.78125
Name: subject, dtype: float64

In [42]:
# function to encode categories and rename columns
def encode_rename_cols(data, cols):
    ce_one_hot = ce.OneHotEncoder(cols = cols, use_cat_names=True)
    features = ce_one_hot.fit_transform(data)
    cols = features.columns.tolist()
    cols = [cat.replace('subject_', '') for cat in cols]
    features.columns = cols
    return features

In [52]:
# encode the style column for train and test data
y_train_encoded = encode_rename_cols(y_train, ['subject'])
y_test_encoded = encode_rename_cols(y_test, ['subject'])

# print out the shapes of the encoded train and test
# y_train should have paintings for all subjects while y_test will have less
y_train_encoded.shape, y_test_encoded.shape

((509, 14), (128, 11))

## Setting up the Convolutional Neural Network

**Basic Architecture**