In [25]:
import pandas as pd
import os
import re

### Identifying location of images in the directory structure for editing

In [2]:
#import the metadata for La Art after moving into the new directory
la_image_metadata = pd.read_csv('../data_samples/results/whole_set_results/pa_latin_art.csv')

In [3]:
la_image_metadata = la_image_metadata.drop(['accessioned', 'displaydate'], axis = 1)

In [4]:
proportion_ofCountry = la_image_metadata['Country Name'].value_counts(normalize=True)
la_image_metadata['percent_fromCountry'] = la_image_metadata['Country Name'].apply(lambda x: proportion_ofCountry[x])
la_image_metadata = la_image_metadata.rename({'Percent_in_NGA': 'percent_fromArtist'}, axis=1)

In [5]:
la_image_directory = '../latinamerican-2-imagefolder-split/'
la_image_metadata['directory'] = [la_image_directory] * len(la_image_metadata)

In [6]:
#using a file path searching algorithm to add the subdirectories the files exist in to make the full filepath
#this function requires the name to be formatted with extension
def find_files(filename, search_path):
    result = []

    # Walking top-down from the root
    for root, dir, files in os.walk(search_path):
        if filename in files:
            result.append(os.path.join(root, filename))
    if len(result) == 1:
        return result[0]
    else:
        return 'Not Found'

In [8]:
la_image_metadata['image_fp'] = la_image_metadata.file_name.apply(lambda x: find_files(x, la_image_directory))

In [9]:
available_data = la_image_metadata.where(la_image_metadata.image_fp != 'Not Found').dropna(how='all')

In [12]:
available_data['old_file_name'] = available_data.file_name.copy()

In [18]:
#selecting columns with title, lastname, and objectid present
available_data = available_data.where(available_data.title.apply(pd.notna)).dropna(how='all')
available_data = available_data.where(available_data.lastname.apply(pd.notna)).dropna(how='all')
available_data = available_data.where(available_data.objectid.apply(pd.notna)).dropna(how='all')

In [22]:
available_data['file_name'] = available_data['title'].apply(lambda x: x.replace(' ','_').replace('/', '&')) + '_' + available_data['lastname'].apply(lambda x: x.replace(' ', '_').replace('/', '&')) + '_' + available_data['objectid'].apply(lambda x: str(int(x)) + '.jpg')

In [33]:
available_data['subfolder'] = available_data.image_fp.apply(lambda x: re.search('train|test', x)[0])

### Identifying the duplicates which causes a reduction in the num of images downloaded due to file name overwriting issue (duplicates have same name, author) & redownloading with appropriate reidentification

In [39]:
#redownloading files with redundant names
#these are files we need to reinspect due to their multiplicity
series_num_objs = available_data.loc[:, ['title', 'lastname', 'preferreddisplayname']].groupby(
    ['title', 'lastname']).apply(len)
series = series_num_objs[series_num_objs > 2]
series_groups = series.index
series_groups[0]
duplicates = None
for title, author in series_groups:
    dupe_partition = available_data.where(
        (available_data.title == title) & (available_data.lastname == author)).dropna(how='all')
    if duplicates is None:
        duplicates = dupe_partition
    else:
        duplicates = pd.concat([duplicates, dupe_partition])

In [42]:
duplicates = duplicates.reset_index(drop=False)

In [51]:
duplicates['old_fp'] = duplicates.directory + duplicates.subfolder + '/' + duplicates['title'].apply(lambda x: x.replace(' ','_').replace('/', '&')) + '_' + duplicates['lastname'].apply(lambda x: x.replace(' ', '_').replace('/', '&') + '.jpg')

In [57]:
#deleting old names (if not found thats a good thing)
for name in duplicates.old_fp.unique():
    try:
        os.remove(name)
    except FileNotFoundError:
        print('Not found: ', name)

Not found:  ../latinamerican-2-imagefolder-split/train/A_Brazilian_in_Florida_Batista.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Garden_Iturbide.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Garden_Toledo.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Garden_de_Ávila.jpg
Not found:  ../latinamerican-2-imagefolder-split/train/Head_Tamayo.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Optical_Box_Soto.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Untitled_Abularach.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Untitled_Gego.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Untitled_Merida.jpg
Not found:  ../latinamerican-2-imagefolder-split/test/Untitled_(Skull)_Cervantes.jpg


In [44]:
import shutil
import requests
def download_image(url, folder, file_name, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
        shutil.move(file_name, folder + file_name)
    else:
        print(response.status_code)
#Define HTTP Headers
headers = {
    "User-Agent": "Chrome/51.0.2704.103",
}
import time
for i in range(0, len(duplicates)):
    #Define URL of an image
    url = duplicates.expanded_url[i]
    # Define image file name
    file_name = duplicates.file_name[i]
    # Define location to move file
    folder = duplicates.directory[i] + duplicates.subfolder[i] + '/'
    if len(file_name[:-4]) > 250:
        num_remove = len(file_name[:-4]) - 250
        file_name = file_name[:-(4 + num_remove)] + '.jpg'
    # Download image
    #timer delay (15 seconds)
    time.sleep(15)
    download_image(url, folder, file_name, headers)

In [59]:
available_data

Unnamed: 0,expanded_url,title,forwarddisplayname,medium,sequence,width,height,maxpixels,depictstmsobjectid,assistivetext,...,isvirtual,portfolio,series,file_name,directory,file_downloaded,percent_fromCountry,image_fp,old_file_name,subfolder
0,https://api.nga.gov/iiif/95bc5221-79a1-49e1-b7...,Festival Andino,Nemesio Antunez,color lithograph,0.0,2963.0,4000.0,640.0,46741.0,,...,0.0,,,Festival_Andino_Antunez_46741.jpg,../latinamerican-2-imagefolder-split/,True,0.059322,../latinamerican-2-imagefolder-split/train/Fes...,Festival_Andino.jpg,train
1,https://api.nga.gov/iiif/f8a0c85d-7cfa-4e63-ae...,Chester Dale,Diego Rivera,oil on canvas,0.0,5297.0,4171.0,640.0,46537.0,,...,0.0,,,Chester_Dale_Rivera_46537.jpg,../latinamerican-2-imagefolder-split/,True,0.607345,../latinamerican-2-imagefolder-split/train/Che...,Chester_Dale.jpg,train
2,https://api.nga.gov/iiif/a2bc5dff-08a4-46d5-97...,Compostion XVI,Matta,"color etching, soft-ground, and aquatint with ...",0.0,2926.0,4000.0,640.0,48425.0,,...,0.0,"Suite from ""Come detta dentro vo significando""","Suite from ""Come detta dentro vo significando""",Compostion_XVI_Matta_48425.jpg,../latinamerican-2-imagefolder-split/,True,0.059322,../latinamerican-2-imagefolder-split/train/Com...,Compostion_XVI.jpg,train
3,https://api.nga.gov/iiif/c73c0136-d48d-4bea-89...,Compostion IV,Matta,"color etching, soft-ground, and aquatint",0.0,4000.0,3101.0,640.0,48413.0,,...,0.0,"Suite from ""Come detta dentro vo significando""","Suite from ""Come detta dentro vo significando""",Compostion_IV_Matta_48413.jpg,../latinamerican-2-imagefolder-split/,True,0.059322,../latinamerican-2-imagefolder-split/train/Com...,Compostion_IV.jpg,train
4,https://api.nga.gov/iiif/1f1918c9-9d30-4491-a9...,Compostion I,Matta,"color etching, soft-ground, and aquatint on Ja...",0.0,4000.0,3084.0,640.0,48410.0,,...,0.0,"Suite from ""Come detta dentro vo significando""","Suite from ""Come detta dentro vo significando""",Compostion_I_Matta_48410.jpg,../latinamerican-2-imagefolder-split/,True,0.059322,../latinamerican-2-imagefolder-split/test/Comp...,Compostion_I.jpg,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,https://api.nga.gov/iiif/552e99b7-0bb5-45e1-8b...,Gran Alarma Escandalosa,José Guadalupe Posada,metalcut and letterpress on pink newsprint,0.0,4949.0,7513.0,,152826.0,,...,0.0,,,Gran_Alarma_Escandalosa_Posada_152826.jpg,../latinamerican-2-imagefolder-split/,True,0.607345,../latinamerican-2-imagefolder-split/train/Gra...,Gran_Alarma_Escandalosa_Posada.jpg,train
349,https://api.nga.gov/iiif/83c83210-819e-4c27-8c...,Serafina,José Guadalupe Posada,metalcut and letterpress on yellow newsprint,0.0,2873.0,4000.0,,152827.0,,...,0.0,,,Serafina_Posada_152827.jpg,../latinamerican-2-imagefolder-split/,True,0.607345,../latinamerican-2-imagefolder-split/train/Ser...,Serafina_Posada.jpg,train
350,https://api.nga.gov/iiif/92410def-988a-4866-9c...,La Mujer de Cien Maridos - Como Alfileres Pren...,José Guadalupe Posada,metalcut on red newsprint,0.0,2726.0,4000.0,,152829.0,,...,0.0,,,La_Mujer_de_Cien_Maridos_-_Como_Alfileres_Pren...,../latinamerican-2-imagefolder-split/,True,0.607345,../latinamerican-2-imagefolder-split/test/La_M...,La_Mujer_de_Cien_Maridos_-_Como_Alfileres_Pren...,test
351,https://api.nga.gov/iiif/fcbe84cb-faa8-471d-b4...,Don Chepito,José Guadalupe Posada,relief etching on pink paper,0.0,4000.0,3169.0,,163508.0,,...,0.0,,,Don_Chepito_Posada_163508.jpg,../latinamerican-2-imagefolder-split/,True,0.607345,../latinamerican-2-imagefolder-split/train/Don...,Don_Chepito_Posada.jpg,train


In [63]:
available_data = available_data.reset_index(drop=False)

In [65]:
for i in range(0, len(available_data)):
    # Renaming filenames in overall metadata & in directory to recognize new naming pattern =, only if not already in to_download
    folder = available_data.directory[i] + available_data.subfolder[i] + '/'
    old_file_name = available_data.old_file_name[i]
    new_file_name = available_data.file_name[i]
    if len(new_file_name[:-4]) > 250:
        num_remove = len(new_file_name[:-4]) - 250
        new_file_name = new_file_name[:-(4 + num_remove)] + '.jpg'
    #renaming the file in the directory
    oldName = os.path.join(folder, old_file_name)
    newName = os.path.join(folder, new_file_name)
    if os.path.exists(oldName) and os.path.exists(newName):
        os.remove(oldName)
        available_data['file_name'][i] = new_file_name
        print('Old Name Deleted: ', old_file_name)
    elif os.path.exists(oldName) and not os.path.exists(newName):
        # update to new naming convention and delete oldName and update metadata
        os.rename(oldName, newName)
        #renaming the file in the metadata
        available_data['file_name'][i] = new_file_name
        print('New Name: ', new_file_name)

New Name:  Sed_non_Satiata_Delhez_119885.jpg
New Name:  No._9,_Nature_Morte_Espagnole_Rivera_121051.jpg
New Name:  Montserrat_Rivera_121052.jpg
New Name:  Prosa_de_la_Calavera_Cervantes_128543.jpg
New Name:  Arlequino_Rayo_130886.jpg
New Name:  Free_Youth,_Smog_and_Demagogue_Matta_133434.jpg
New Name:  Machines_(Máquinas)_Orozco_133435.jpg
Old Name Deleted:  Untitled_(Skull).jpg
New Name:  Cinco_Grabados_(Five_Prints)_Cervantes_133869.jpg
New Name:  Untitled_(Envelope_Torn)_Cervantes_133877.jpg
New Name:  Untitled_(Envelope_with_Leaves)_Cervantes_133878.jpg
New Name:  Untitled_(Envelope_Open)_Cervantes_133879.jpg
New Name:  Untitled_(Envelope_Painted)_Cervantes_133880.jpg
New Name:  Untitled_(Envelope_Composition)_Cervantes_133881.jpg
New Name:  Community_of_Franciscan_Monks,_San_Francisco_Monastery,_Cusco_Chambi_134860.jpg
New Name:  Parabola_Optica_(Optical_parable)_Álvarez_Bravo_136777.jpg
New Name:  Mental_Reactions_de_Zayas_137583.jpg
New Name:  Drowned_Shadow_Bonomi_145426.jpg
Ne

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [69]:
available_data.to_csv('../data_samples/results/whole_set_results/downloaded_LaArt.csv', index=False)