In [29]:
import os
from pathlib import Path
import shutil

import pandas as pd

# Setup directories

In [31]:
# The project root is the parent of the directory the notebook is run from.
project_root_dir = str(Path.cwd().parents[0])

# <root>/data holds the per-task datasets as well as the shared pool of
# resized source images that the splits below copy from.
data_dir = str(Path(project_root_dir, 'data'))
datasets_dir = str(Path(data_dir, 'datasets'))
src_images_dir = str(Path(data_dir, 'all_images_resized_256'))

# Train and Test split

## Artists dataset

In [13]:
# Locate the artists dataset and read its full metadata table.
artist_dataset_dir = os.path.join(datasets_dir, 'artists')
all_artists_csv_fpath = os.path.join(artist_dataset_dir, 'csv_files', 'all_data.csv')

all_artists_df = pd.read_csv(all_artists_csv_fpath)

# Preview the first rows as the cell output.
all_artists_df.head()

Unnamed: 0.1,Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,14,Ivan Aivazovsky,1873.0,marina,7444.0,5811.0,3189597.0,wikiart,Romanticism,Storm at sea,train_and_test,True,99442.jpg
1,28,Gustave Dore,1866.0,religious painting,4915.0,6068.0,29811319.0,wikiart,Romanticism,Death of Eleazer,train_and_test,True,7486.jpg
2,29,Gustave Dore,,religious painting,4770.0,6219.0,32443139.0,wikiart,Romanticism,The-Deluge,train_and_test,True,35766.jpg
3,34,Gustave Dore,,illustration,5878.0,4678.0,24158097.0,wikiart,Romanticism,The hoarders and wasters,train_and_test,False,31977.jpg
4,82,Ivan Aivazovsky,1850.0,marina,5815.0,3840.0,16019462.0,wikiart,Romanticism,The Ninth Wave,train_and_test,True,81750.jpg


In [14]:
# Fail fast if the metadata table does not have the expected size:
# 9987 rows spread over 20 distinct artists.
artists_row_count = all_artists_df.shape[0]
assert artists_row_count == 9987, 'Rows seem to be missing'

artists_class_count = all_artists_df['artist'].nunique()
assert artists_class_count == 20, 'Dataset has 20 artists'

In [26]:
# Train set: a fixed-seed draw of 450 rows from every artist group.
artists_by_class = all_artists_df.groupby(['artist'])
artists_train_df = artists_by_class.sample(n=450, random_state=42)
artists_train_index = artists_train_df.index

# Test set: every row whose index label was not drawn for training.
artists_is_train_row = all_artists_df.index.isin(artists_train_index)
artists_test_df = all_artists_df.loc[~artists_is_train_row]

# The two splits together must account for every row.
artists_split_row_count = len(artists_train_df) + len(artists_test_df)
assert artists_split_row_count == len(all_artists_df)

In [27]:
# Persist both splits next to all_data.csv.
# The DataFrame index is written too (to_csv default), as an unnamed leading column.
artists_csv_dir = os.path.join(artist_dataset_dir, 'csv_files')

artists_train_df.to_csv(os.path.join(artists_csv_dir, 'train_data.csv'))
artists_test_df.to_csv(os.path.join(artists_csv_dir, 'test_data.csv'))

In [36]:
# Re-read the split tables from disk and copy every listed image from the
# shared source folder into images/train or images/test for the artists task.
artists_csv_dir = os.path.join(artist_dataset_dir, 'csv_files')
train_df = pd.read_csv(os.path.join(artists_csv_dir, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(artists_csv_dir, 'test_data.csv'))

artists_images_dir = os.path.join(artist_dataset_dir, 'images')
artists_train_images_dir = os.path.join(artists_images_dir, 'train')
artists_test_images_dir = os.path.join(artists_images_dir, 'test')

# Create both destination folders up front; re-running the cell is fine.
for split_images_dir in (artists_train_images_dir, artists_test_images_dir):
    os.makedirs(split_images_dir, exist_ok=True)

# Copy (not move) the images: train split first, then test split.
for split_df, split_images_dir in (
    (train_df, artists_train_images_dir),
    (test_df, artists_test_images_dir),
):
    for image_fname in split_df['new_filename']:
        src_image_fpath = os.path.join(src_images_dir, image_fname)
        dst_image_fpath = os.path.join(split_images_dir, image_fname)
        shutil.copy(src_image_fpath, dst_image_fpath)

# Both folders together must contain exactly one file per dataset row.
artists_copied_count = (
    len(os.listdir(artists_train_images_dir))
    + len(os.listdir(artists_test_images_dir))
)
assert artists_copied_count == 9987

## Genres dataset

In [38]:
# Locate the genres dataset and read its full metadata table.
genres_dataset_dir = os.path.join(datasets_dir, 'genres')
all_genres_csv_fpath = os.path.join(genres_dataset_dir, 'csv_files', 'all_data.csv')

all_genres_df = pd.read_csv(all_genres_csv_fpath)

# Preview the first rows as the cell output.
all_genres_df.head()

Unnamed: 0.1,Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,96422,Al Held,1966.0,abstract,416.0,480.0,50341.0,wikiart,Hard Edge Painting,Acracropolis,train_and_test,False,32950.jpg
1,92745,Carlos Merida,1977.0,abstract,400.0,570.0,172366.0,wikiart,Cubism,El doble,train_only,True,524.jpg
2,86479,John McLaughlin,1948.0,abstract,563.0,480.0,111591.0,wikiart,Neoplasticism,Untitled,train_only,True,70495.jpg
3,72064,Jack Tworkov,1954.0,abstract,586.0,720.0,224375.0,wikiart,Abstract Expressionism,Pink Mississippi,train_and_test,False,69560.jpg
4,92680,Martin Barre,,abstract,457.0,500.0,85432.0,wikiart,Minimalism,unknown title,train_only,True,16073.jpg


In [41]:
# Fail fast if the metadata table does not have the expected size:
# 20000 rows spread over 10 distinct genres.
genres_row_count = all_genres_df.shape[0]
assert genres_row_count == 20000, 'Rows seem to be missing'

genres_class_count = all_genres_df['genre'].nunique()
assert genres_class_count == 10, 'Dataset has 10 genres'

In [42]:
# Train set: a fixed-seed draw of 1800 rows from every genre group.
genres_by_class = all_genres_df.groupby(['genre'])
genres_train_df = genres_by_class.sample(n=1800, random_state=42)
genres_train_index = genres_train_df.index

# Test set: every row whose index label was not drawn for training.
genres_is_train_row = all_genres_df.index.isin(genres_train_index)
genres_test_df = all_genres_df.loc[~genres_is_train_row]

# The two splits together must account for every row.
genres_split_row_count = len(genres_train_df) + len(genres_test_df)
assert genres_split_row_count == len(all_genres_df)

In [44]:
# Persist both splits next to all_data.csv.
# The DataFrame index is written too (to_csv default), as an unnamed leading column.
genres_csv_dir = os.path.join(genres_dataset_dir, 'csv_files')

genres_train_df.to_csv(os.path.join(genres_csv_dir, 'train_data.csv'))
genres_test_df.to_csv(os.path.join(genres_csv_dir, 'test_data.csv'))

In [46]:
# Re-read the split tables from disk and copy every listed image from the
# shared source folder into images/train or images/test for the genres task.
genres_csv_dir = os.path.join(genres_dataset_dir, 'csv_files')
train_df = pd.read_csv(os.path.join(genres_csv_dir, 'train_data.csv'))
# Fix: the original call was missing the comma between 'csv_files' and
# 'test_data.csv', so the two literals were concatenated into the single path
# component 'csv_filestest_data.csv' and the wrong file was looked up.
test_df = pd.read_csv(os.path.join(genres_csv_dir, 'test_data.csv'))

genres_images_dir = os.path.join(genres_dataset_dir, 'images')
genres_train_images_dir = os.path.join(genres_images_dir, 'train')
genres_test_images_dir = os.path.join(genres_images_dir, 'test')

# Create both destination folders up front; re-running the cell is fine.
os.makedirs(genres_train_images_dir, exist_ok=True)
os.makedirs(genres_test_images_dir, exist_ok=True)

# Copy (not move) the images: the source folder is left untouched.
for image_fname in train_df['new_filename']:
    src_image_fpath = os.path.join(src_images_dir, image_fname)
    dst_image_fpath = os.path.join(genres_train_images_dir, image_fname)

    shutil.copy(src_image_fpath, dst_image_fpath)

for image_fname in test_df['new_filename']:
    src_image_fpath = os.path.join(src_images_dir, image_fname)
    dst_image_fpath = os.path.join(genres_test_images_dir, image_fname)

    shutil.copy(src_image_fpath, dst_image_fpath)

# Both folders together must contain exactly one file per dataset row.
assert len(os.listdir(genres_train_images_dir)) + len(os.listdir(genres_test_images_dir)) == 20000

## Styles dataset

In [48]:
# Locate the styles dataset and read its full metadata table.
styles_dataset_dir = os.path.join(datasets_dir, 'styles')
all_styles_csv_fpath = os.path.join(styles_dataset_dir, 'csv_files', 'all_data.csv')

all_styles_df = pd.read_csv(all_styles_csv_fpath)

# Preview the first rows as the cell output.
all_styles_df.head()

Unnamed: 0.1,Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,82497,Arthur Pinajian,1960.0,abstract,650.0,466.0,100457.0,wikiart,Abstract Expressionism,Untitled,train_and_test,True,102700.jpg
1,62187,George Stefanescu-Ramnic,2002.0,religious painting,657.0,800.0,315707.0,wikiart,Abstract Expressionism,Crucifixion,train_and_test,True,23963.jpg
2,99488,Clyfford Still,1952.0,abstract,378.0,475.0,29950.0,wikiart,Abstract Expressionism,Untitled,train_only,True,793.jpg
3,91568,Ralph Rosenborg,1966.0,abstract,600.0,394.0,109968.0,wikiart,Abstract Expressionism,An Italian Landscape with Trees,train_only,True,76589.jpg
4,38573,William Scott,1960.0,abstract,944.0,877.0,90544.0,wikiart,Abstract Expressionism,Morning in Mykonos,train_and_test,True,41515.jpg


In [49]:
# Fail fast if the metadata table does not have the expected size:
# 20000 rows spread over 20 distinct styles.
styles_row_count = all_styles_df.shape[0]
assert styles_row_count == 20000, 'Rows seem to be missing'

# Bracket access is required here: `.style` on a DataFrame is the Styler accessor.
styles_class_count = all_styles_df['style'].nunique()
assert styles_class_count == 20, 'Dataset has 20 styles'

In [51]:
# Train set: a fixed-seed draw of 900 rows from every style group.
styles_by_class = all_styles_df.groupby(['style'])
styles_train_df = styles_by_class.sample(n=900, random_state=42)
styles_train_index = styles_train_df.index

# Test set: every row whose index label was not drawn for training.
styles_is_train_row = all_styles_df.index.isin(styles_train_index)
styles_test_df = all_styles_df.loc[~styles_is_train_row]

# The two splits together must account for every row.
styles_split_row_count = len(styles_train_df) + len(styles_test_df)
assert styles_split_row_count == len(all_styles_df)

In [52]:
# Persist both splits next to all_data.csv.
# The DataFrame index is written too (to_csv default), as an unnamed leading column.
styles_csv_dir = os.path.join(styles_dataset_dir, 'csv_files')

styles_train_df.to_csv(os.path.join(styles_csv_dir, 'train_data.csv'))
styles_test_df.to_csv(os.path.join(styles_csv_dir, 'test_data.csv'))

In [54]:
# Re-read the split tables from disk and copy every listed image from the
# shared source folder into images/train or images/test for the styles task.
styles_csv_dir = os.path.join(styles_dataset_dir, 'csv_files')
train_df = pd.read_csv(os.path.join(styles_csv_dir, 'train_data.csv'))
# Fix: the original call was missing the comma between 'csv_files' and
# 'test_data.csv', so the two literals were concatenated into the single path
# component 'csv_filestest_data.csv' and the wrong file was looked up.
test_df = pd.read_csv(os.path.join(styles_csv_dir, 'test_data.csv'))

styles_images_dir = os.path.join(styles_dataset_dir, 'images')
styles_train_images_dir = os.path.join(styles_images_dir, 'train')
styles_test_images_dir = os.path.join(styles_images_dir, 'test')

# Create both destination folders up front; re-running the cell is fine.
os.makedirs(styles_train_images_dir, exist_ok=True)
os.makedirs(styles_test_images_dir, exist_ok=True)

# Copy (not move) the images: the source folder is left untouched.
for image_fname in train_df['new_filename']:
    src_image_fpath = os.path.join(src_images_dir, image_fname)
    dst_image_fpath = os.path.join(styles_train_images_dir, image_fname)

    shutil.copy(src_image_fpath, dst_image_fpath)

for image_fname in test_df['new_filename']:
    src_image_fpath = os.path.join(src_images_dir, image_fname)
    dst_image_fpath = os.path.join(styles_test_images_dir, image_fname)

    shutil.copy(src_image_fpath, dst_image_fpath)

# Both folders together must contain exactly one file per dataset row.
assert len(os.listdir(styles_train_images_dir)) + len(os.listdir(styles_test_images_dir)) == 20000