In [75]:
import pandas as pd
from os.path import exists
import os
import re
from sklearn.model_selection import train_test_split
import shutil
import requests
import time
import numpy as np

In [97]:
#using a premade subset/sample that is representative in diversity in country origin as parent dataset (excluding latin america)
nonLa_image_metadata_sample = pd.read_csv('../../../data_samples/results/processed_subset_results/non_latinamericanart_sample.csv', low_memory=False)

In [98]:
#dropping unneccessary rows (1 removed)
nonLa_image_metadata_sample = nonLa_image_metadata_sample.where(nonLa_image_metadata_sample.title.apply(pd.notna)).dropna(how='all')
nonLa_image_metadata_sample = nonLa_image_metadata_sample.where(nonLa_image_metadata_sample.forwarddisplayname.apply(pd.notna)).dropna(how='all')
nonLa_image_metadata_sample = nonLa_image_metadata_sample.where(nonLa_image_metadata_sample.objectid.apply(pd.notna)).dropna(how='all')

### Image Scraping

In [99]:
shape_initial = nonLa_image_metadata_sample.shape

In [100]:
shape_initial

(34863, 51)

Adding Columns

In [101]:
# adding expected file name
nonLa_image_metadata_sample['file_name'] = nonLa_image_metadata_sample['title'].apply(lambda x: x.replace(' ','_').replace('/', '&')) + '_' + nonLa_image_metadata_sample['forwarddisplayname'].apply(lambda x: x.replace(' ', '_').replace('/', '&')) + '_' + nonLa_image_metadata_sample['objectid'].apply(lambda x: str(int(x)) + '.jpg')
#adding the expected root directory of the image files
la_image_directory = '../../../latinamerican-2-imagefolder-split/'
nonLa_image_metadata_sample['directory'] = [la_image_directory] * len(nonLa_image_metadata_sample)
#subfolder split (train 70% /test 30%) need to be identified randomly
train_data, test_data = train_test_split(nonLa_image_metadata_sample, test_size=0.3)
train_data['subfolder'] = ['train'] * len(train_data)
test_data['subfolder'] = ['test'] * len(test_data)
#adding new split (subfolder) column to la_image_metadata
nonLa_image_metadata_sample = pd.concat([train_data, test_data]).reset_index(drop=True)
# adding expected filepath (directory + filename)
nonLa_image_metadata_sample['image_fp'] = nonLa_image_metadata_sample.directory + nonLa_image_metadata_sample.subfolder + '/' + nonLa_image_metadata_sample.file_name

Check the change in shape for input in M.L algs.

In [102]:
#To verify data shape during process
shape_change_1 = nonLa_image_metadata_sample.shape
print('Shape starting: ', shape_initial)
print('Shape after edit 1: ', shape_change_1)

Shape starting:  (34863, 51)
Shape after edit 1:  (34863, 55)


### Saving the files edites / created in this NB

This is where data is saved and outputted. If the .py files are run using the .sh script then the data will be edited.

In [103]:
# saves the selected parts to a new csv file to run the download script portion of downloadLa
nonLa_image_fpaths = nonLa_image_metadata_sample.loc[:, ['objectid', 'file_name', 'image_fp']]
nonLa_image_fpaths.to_csv('../../../data_samples/art_tables_test/nonla_image_fpaths_sample.csv', index=False)
print('CSV Created: ../../../data_samples/art_tables_test/nonla_image_fpaths_sample.csv')
nonLa_image_metadata_sample.to_csv('../../../data_samples/art_tables_test/non_latinamerican_art_sample.csv', index=False)
print('CSV Edited: ../../../data_samples/art_tables_test/non_latinamerican_art_sample.csv')

CSV Created: ../../../data_samples/art_tables_test/nonla_image_fpaths_sample.csv
CSV Edited: ../../../data_samples/art_tables_test/non_latinamerican_art_sample.csv


### Downloading the images from the open source NGA Database API

This is how the images would be downloaded if ran as a script.

In [122]:
nonLa_image_metadata_sample.role.value_counts()

artist                       13771
previous owner                6526
source                        5880
donor                         5364
printer                       1388
publisher                      890
artist after                   230
painter                        225
sculptor                       180
edition production              67
designer                        66
manufacturer                    52
technical collaborator          40
architect                       35
author                          25
medalist                        19
editor                          17
photographer                    14
processor                       12
related artist                  12
processing and proofing          8
engraver/modeler                 8
collaborator                     6
die engraver                     5
collaborator & supervisor        5
engraver                         5
cabinetmaker                     4
ceramist                         3
quiltmaker          

In [119]:

nonLa_image_metadata_sample.where(nonLa_image_metadata_sample.forwarddisplayname.apply(lambda x: 'Julius' in x)).dropna(how='all')

Unnamed: 0,uuid,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,...,pct_country_NGA,Country,Continent,Country Name,countries_missing,expanded_url,file_name,directory,subfolder,image_fp
7,7fcd2920-4d0f-4c0f-b184-45574ab7cd4c,https://api.nga.gov/iiif/7fcd2920-4d0f-4c0f-b1...,https://api.nga.gov/iiif/7fcd2920-4d0f-4c0f-b1...,1.0,Aesacus and Hesperie,"American, 1891 - 1979",donor,donor,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/7fcd2920-4d0f-4c0f-b1...,Aesacus_and_Hesperie_Lessing_Julius_Rosenwald_...,../../../latinamerican-2-imagefolder-split/,train,../../../latinamerican-2-imagefolder-split/tra...
8,38eda221-c29b-4f68-9a14-e6ee5a75ad9e,https://api.nga.gov/iiif/38eda221-c29b-4f68-9a...,https://api.nga.gov/iiif/38eda221-c29b-4f68-9a...,1.0,Two Brooches with Bird and Winged Snake at Top...,"American, 1891 - 1979",donor,donor,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/38eda221-c29b-4f68-9a...,Two_Brooches_with_Bird_and_Winged_Snake_at_Top...,../../../latinamerican-2-imagefolder-split/,train,../../../latinamerican-2-imagefolder-split/tra...
9,e0001381-dc13-4ec9-9a44-d98371cdec9c,https://api.nga.gov/iiif/e0001381-dc13-4ec9-9a...,https://api.nga.gov/iiif/e0001381-dc13-4ec9-9a...,1.0,Little Smithfield,"American, 1891 - 1979",owner,previous owner,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/e0001381-dc13-4ec9-9a...,Little_Smithfield_Lessing_Julius_Rosenwald_109...,../../../latinamerican-2-imagefolder-split/,train,../../../latinamerican-2-imagefolder-split/tra...
10,b422c844-15a6-427c-87e6-c76794b606b7,https://api.nga.gov/iiif/b422c844-15a6-427c-87...,https://api.nga.gov/iiif/b422c844-15a6-427c-87...,1.0,Au fil de la marne,"American, 1891 - 1979",donor,donor,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/b422c844-15a6-427c-87...,Au_fil_de_la_marne_Lessing_Julius_Rosenwald_71...,../../../latinamerican-2-imagefolder-split/,train,../../../latinamerican-2-imagefolder-split/tra...
14,ee182ab6-9885-4edc-9f5a-0e44f692e8dd,https://api.nga.gov/iiif/ee182ab6-9885-4edc-9f...,https://api.nga.gov/iiif/ee182ab6-9885-4edc-9f...,1.0,Sheet of Studies. Profiles of Marie-Thérèse an...,"American, 1891 - 1979",owner,previous owner,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/ee182ab6-9885-4edc-9f...,Sheet_of_Studies._Profiles_of_Marie-Thérèse_an...,../../../latinamerican-2-imagefolder-split/,train,../../../latinamerican-2-imagefolder-split/tra...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34849,e1f54067-b73a-4e53-9430-f3539cf9c62b,https://api.nga.gov/iiif/e1f54067-b73a-4e53-94...,https://api.nga.gov/iiif/e1f54067-b73a-4e53-94...,1.0,Saint George Liberating the Princess,"American, 1891 - 1979",donor,source,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/e1f54067-b73a-4e53-94...,Saint_George_Liberating_the_Princess_Lessing_J...,../../../latinamerican-2-imagefolder-split/,test,../../../latinamerican-2-imagefolder-split/tes...
34850,cc210495-9964-48cc-ac69-a6ddbfc0584a,https://api.nga.gov/iiif/cc210495-9964-48cc-ac...,https://api.nga.gov/iiif/cc210495-9964-48cc-ac...,1.0,Hercules and the Cretan Bull,"American, 1891 - 1979",donor,donor,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/cc210495-9964-48cc-ac...,Hercules_and_the_Cretan_Bull_Lessing_Julius_Ro...,../../../latinamerican-2-imagefolder-split/,test,../../../latinamerican-2-imagefolder-split/tes...
34852,963d08ef-7b69-4b75-a68d-a726e17a66b2,https://api.nga.gov/iiif/963d08ef-7b69-4b75-a6...,https://api.nga.gov/iiif/963d08ef-7b69-4b75-a6...,1.0,Marechal de Turenne,"American, 1891 - 1979",donor,donor,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/963d08ef-7b69-4b75-a6...,Marechal_de_Turenne_Lessing_Julius_Rosenwald_9...,../../../latinamerican-2-imagefolder-split/,test,../../../latinamerican-2-imagefolder-split/tes...
34853,3658e42f-e713-4674-88c8-c1066eb20eb0,https://api.nga.gov/iiif/3658e42f-e713-4674-88...,https://api.nga.gov/iiif/3658e42f-e713-4674-88...,1.0,"Jaime Sabartes' Dans l'Atelier de Picasso""""","American, 1891 - 1979",owner,previous owner,Lessing Julius Rosenwald,1891.0,...,0.70489,US,NoA,United States,38.0,https://api.nga.gov/iiif/3658e42f-e713-4674-88...,"Jaime_Sabartes'_Dans_l'Atelier_de_Picasso""""_Le...",../../../latinamerican-2-imagefolder-split/,test,../../../latinamerican-2-imagefolder-split/tes...


In [107]:
import shutil
import requests
def download_image(url, path, name, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(name, "wb") as f:
            f.write(response.content)
        shutil.move(name, path)
    else:
        print(response.status_code)
        
#Define HTTP Headers
ua_header = {
    "User-Agent": "Chrome/51.0.2704.103",
}
# this block now uses the la_image_fpaths table
print("Download starting... please wait :)")
for i in range(0, len(nonLa_image_fpaths)):
    
    # Define URL of an image
    expanded_url = nonLa_image_metadata_sample.expanded_url[i]
    # Define image file name, file path to place
    file_name = nonLa_image_fpaths.file_name[i]
    fp = nonLa_image_fpaths.image_fp[i]
    # Download image
    # timer delay (15 seconds)
    time.sleep(15)
    download_image(expanded_url, fp, file_name, ua_header)
    
    
print("Download finished.")

Download starting... please wait :)


OSError: [Errno 36] File name too long: "Et_l'homme_parut,_interrogeant_le_sol_d'ou__il_sort_et_qui_l'attire,_il_se_fraya_la_voie_vers_(And_Man_appeared;_questioning_theearth_from_which_he_emerged_and_which_attracts_hi_m,_he_made_his_way_toward_somber_brightness)_Lessing_Julius_Rosenwald_40263.jpg"

In [79]:
pa_nonLa_art.title = pa_nonLa_art.title.fillna('missing')
pa_nonLa_art.lastname = pa_nonLa_art.lastname.fillna('missing')
pa_nonLa_art['file_name'] = pa_nonLa_art.title.apply(lambda x: x.replace(' ', '_').replace('/', '&'))  + '_' + pa_nonLa_art.lastname.apply(lambda x: x.replace(' ', '_').replace('/', '&')) + '.jpg'

### Verifying File Names Correspond to the Dataset (Some Images Unable to be Downloaded)

In [81]:
pa_nonLa_art['directory'] = ['../../non_laImages'] * len(pa_nonLa_art)

In [82]:
#checking that the filepath / naming conventions I used are consistent
from os.path import exists
file_exists = []
for i in range(len(pa_nonLa_art)):
    directory = pa_nonLa_art.directory[i]
    filename = pa_nonLa_art.file_name[i]
    full = directory + '/' + filename
    file_exists.append(exists(full))
pa_nonLa_art['file_downloaded'] = file_exists

In [84]:
pa_nonLa_art.file_downloaded.sum() / len(pa_nonLa_art)

0.9883603238866396

In [85]:
#dropping columns where file wasn't accessible
pa_nonLa_art = pa_nonLa_art.where(pa_nonLa_art.file_downloaded).dropna(how='all')

In [89]:
pa_nonLa_art.to_csv('../../data_samples/results/processed_subset_results/subsampled_nonLa_art.csv', index=False)