In [2]:
import pandas as pd
from os.path import exists
import os
import re
from sklearn.model_selection import train_test_split
import shutil
import requests
import time

### Importing the Data from the National Gallery of Art (NGA)
This NB reflects Step 2 of the LaArt Pipeline. First I import the metadata for the latinamerican portion of the art, which is a CSV file produced in Step 1.

In [3]:
#import the metadata for La Art (this is a result of a MySql Database query script) with minor modifications
la_image_metadata = pd.read_csv('../data_samples/latinamerican_art.csv')
shape_initial = la_image_metadata.shape

Dropping Columns

In [4]:
#dropping unneccessary columns (1 removed)
la_image_metadata = la_image_metadata.drop(['accessioned'], axis = 1)
#dropping unneccessary rows (1 removed)
la_image_metadata = la_image_metadata.where(la_image_metadata.title.apply(pd.notna)).dropna(how='all')
la_image_metadata = la_image_metadata.where(la_image_metadata.forwarddisplayname.apply(pd.notna)).dropna(how='all')
la_image_metadata = la_image_metadata.where(la_image_metadata.objectid.apply(pd.notna)).dropna(how='all')

Adding Columns

In [5]:
# adding expected file name
la_image_metadata['file_name'] = la_image_metadata['title'].apply(lambda x: x.replace(' ','_').replace('/', '&')) + '_' + la_image_metadata['forwarddisplayname'].apply(lambda x: x.replace(' ', '_').replace('/', '&')) + '_' + la_image_metadata['objectid'].apply(lambda x: str(int(x)) + '.jpg')
#adding the expected root directory of the image files
la_image_directory = '../latinamerican-2-imagefolder-split/'
la_image_metadata['directory'] = [la_image_directory] * len(la_image_metadata)
#subfolder split (train 70% /test 30%) need to be identified randomly
train_data, test_data = train_test_split(la_image_metadata, test_size=0.3)
train_data['subfolder'] = ['train'] * len(train_data)
test_data['subfolder'] = ['test'] * len(test_data)
#adding new split (subfolder) column to la_image_metadata
la_image_metadata = pd.concat([train_data, test_data]).reset_index(drop=True)
# adding expected filepath (directory + filename)
la_image_metadata['image_fp'] = la_image_metadata.directory + la_image_metadata.subfolder + '/' + la_image_metadata.file_name

Check the change in shape for input in M.L algs.

In [7]:
#To verify data shape during process
shape_change_1 = la_image_metadata.shape
print('Shape starting: ', shape_initial)
print('Shape after edit 1: ', shape_change_1)

Shape starting:  (342, 47)
Shape after edit 1:  (341, 50)


### Saving the files edites / created in this NB

This is where data is saved and outputted. If the .py files are run using the .sh script then the data will be edited. This is commented since this is a notebook for testing.

In [None]:
# saves the selected parts to a new csv file to run the download script portion of downloadLa
#la_image_fpaths = la_image_metadata.loc[:, ['expanded_url', 'file_name', 'image_fp']]
#la_image_fpaths.to_csv('../data_samples/la_image_fpaths.csv', index=False)
#print('CSV Created: ../data_samples/la_image_fpaths.csv')
#la_image_metadata.to_csv('../data_samples/latinamerican_art.csv', index=False)
#print('CSV Edited: ../data_samples/latinamerican_art.csv')

### Downloading the images from the open source NGA Database API

This is how the images would be downloaded if ran as a script.

In [None]:
## for next code section in notebook, images will be downloaded
## I assume no images are downloaded & image_fp/directory not created
#def download_image(url, path, name, headers):
#    response = requests.get(url, headers=headers)
#    if response.status_code == 200:
#        with open(name, "wb") as f:
#            f.write(response.content)
#        shutil.move(name, path)
#    else:
#        print(response.status_code)
## Define HTTP Headers
#ua_header = {
#    "User-Agent": "Chrome/51.0.2704.103",
#}

# this block now uses the la_image_fpaths table
#print("Download starting... please wait :)")
## This block iterates over all the images
#for i in range(0, len(la_image_fpaths)):
#    # Define URL of an image
#    expanded_url = la_image_fpaths.expanded_url[i]
#    # Define image file name, file path to place
#    file_name = la_image_fpaths.file_name[i]
#    fp = la_image_fpaths.image_fp[i]
#    # Download image
#    # timer delay (15 seconds)
#    time.sleep(15)
#    download_image(expanded_url, fp, file_name, ua_header)

### Verifying File Names Correspond to the Dataset (Some Images Unable to be Downloaded)

In [33]:
#checking that the filepath / naming conventions I used are consistent
#from os.path import exists
file_exists = []
for i in range(len(la_image_metadata)):
    directory = la_image_metadata.directory[i]
    subfolder = la_image_metadata.subfolder[i]
    filename = la_image_metadata.file_name[i]
    full = directory + subfolder + '/' + filename
    file_exists.append(exists(full))

In [34]:
file_exists = pd.Series(file_exists, name='imagefp_exists')

In [45]:
la_image_metadata['imagefp_exists'] = file_exists

In [51]:
validLa_image_fpaths = la_image_metadata.loc[:, ['image_fp', 'imagefp_exists', 'objectid']]

In [53]:
validLa_image_fpaths.to_csv('../data_samples/validLa_image_fpaths.csv', index=False)

In [44]:
perc_exists = file_exists.sum()/len(file_exists)
total = 341
whole_num_exists = perc_exists * total
text = 'The amount of images downloaded is {} percent. Which means {} is amount downloaded, out of {} in latinamerican_art.csv'
text.format(perc_exists, whole_num_exists, total)


'The amount of images downloaded is 0.6217008797653959 percent. Which means 212.0 is amount downloaded, out of 341 in latinamerican_art.csv'

In [27]:
np.median(predictions.loc[:, 'Image Accuracy - ResNet V2 (%)'])

13.693586736917496