# In [1]:
import requests
import pandas as pd
from nypl_token import Token
import os


# Root directory under which the collection folder is created and images are saved.
# NOTE: this is the current working directory, not necessarily the folder that
# contains this script/notebook. The earlier dirname(abspath("__file__")) form
# passed the literal string "__file__" (not the module attribute) and therefore
# resolved to the same working directory; os.getcwd() states that directly.
ROOT_PATH = os.getcwd()

# User settings: API token, collection UUID and output folder label are set here;
# the image size to download and the text filter are configured further down.

# API token, taken from Token in the nypl_token module
token = Token.nypl_tok

# UUID of the collection to download
uuid = "2ffdbff0-c6f4-012f-6c96-58d385a7bc34"


# label used as the name of the output folder
collection_label = 'test_2'

# Size code of the image version to download (assigned to image_select below).
# This project only grabs the 760 px size ('w'); handling captures that lack the
# requested size would need further logic.
#   b - .jpeg center cropped thumbnail (100x100 pixels)
#   f - .jpeg (140 pixels tall with variable width)
#   t - .gif (150 pixels on the long side)
#   r - .jpeg (300 pixels on the long side)
#   w - .jpeg (760 pixels on the long side)
#   q - .jpeg (1600 pixels on the long side)
#   v - .jpeg (2560 pixels on the long side)
#   g - .jpeg original dimensions

# NOTE: all but the high res files will be jpeg; if you plan to resize, pull the
# images in a larger format than what you will need.
image_select = "w"

# Whether to filter out items labeled as text (title pages, tables of contents,
# etc.). The result depends on the labeling done in Digital Collections.
# 'y' filters text items out, 'n' keeps them.
filter_text_items = "y"

# normalise to lower case so the later comparison against 'y' also accepts 'Y'
filter_text_items = str.lower(filter_text_items)

# Create the output folder for this collection. makedirs(exist_ok=True) replaces
# the exists()/mkdir() pair: it is a no-op when the folder is already there (e.g.
# on a re-run), cannot fail if the folder appears between a check and the create,
# and also creates intermediate folders if the label contains a path separator.
os.makedirs(os.path.join(ROOT_PATH, collection_label), exist_ok=True)

# First call: fetch the parent collection record and flatten its JSON. The
# response carries the sub-collection count and the list of sub-collections
# (or of items) that the rest of the script reads from `df`.
# NOTE(review): the token is sent over plain http here - confirm whether the API
# also accepts https and switch if so.
response = requests.get('http://api.repo.nypl.org/api/v1/collections/' + uuid + '?&per_page=500',
                        headers={'Authorization': 'Token token=' + token},
                        timeout=30)  # seconds; without it a stalled connection hangs forever

# Fail right here with the HTTP error (e.g. rejected token, unknown UUID) instead
# of failing later on a missing column in the flattened response.
response.raise_for_status()

data = response.json()

# nested dicts become dotted column names; list values stay as list cells
df = pd.json_normalize(data)

# lists that collect the raw per-request item responses
item_list = []
item_list_2 = []

# number of sub-collections reported by the API; .item() raises unless the
# column holds exactly one value
sub_collection = int(df["nyplAPI.response.numSubCollections"].item())

# running counters used to number the downloaded image files
count = 0
count_2 = 0

# ---------------------------------------------------------------------------
# Resolve the UUIDs to query (sub-collections if the collection has any,
# otherwise its items), fetch their capture metadata, pick the links for the
# selected image size and download them into the collection folder.
#
# Both cases share one pipeline; only the column holding the UUIDs differs.
# ---------------------------------------------------------------------------

# seconds to wait on any single HTTP request before giving up
_REQUEST_TIMEOUT = 30

# header sent with every API and image request
_AUTH_HEADERS = {'Authorization': 'Token token=' + token}

# File extension per size code. Per the size table in the settings, 't' is a
# .gif; every other code is saved as .jpg.
_EXTENSION_BY_SIZE = {'t': '.gif'}


def _uuids_from_column(frame, column):
    """Return the 'uuid' of every record held in the list cells of frame[column].

    Raises KeyError if `column` is missing from `frame` or the records carry
    no 'uuid' field.
    """
    records = frame.explode(column, ignore_index=True)[column].dropna().tolist()
    if not records:
        return []
    return pd.json_normalize(records)['uuid'].tolist()


def _fetch_captures(target_uuids):
    """Query the items endpoint for each UUID and return all captures as one DataFrame.

    Returns an empty DataFrame when no response contains captures. Raises
    requests.HTTPError if any metadata request is answered with an error status.
    """
    payloads = []
    for target in target_uuids:
        # NOTE(review): only the first page (up to 500 entries) is requested;
        # larger sets would presumably need paging - confirm against the API docs.
        reply = requests.get('http://api.repo.nypl.org/api/v1/items/' + target + '?&per_page=500',
                             headers=_AUTH_HEADERS, timeout=_REQUEST_TIMEOUT)
        reply.raise_for_status()
        payloads.append(reply.json())

    frame = pd.json_normalize(payloads)
    if 'nyplAPI.response.capture' not in frame.columns:
        return pd.DataFrame()

    frame = frame.explode('nyplAPI.response.capture', ignore_index=True)
    # responses without captures explode to NaN, which json_normalize cannot flatten
    captures = frame['nyplAPI.response.capture'].dropna().tolist()
    return pd.json_normalize(captures)


def _select_image_links(link_column, size_code):
    """Return the download links in `link_column` that belong to `size_code`."""
    links = link_column.explode().dropna()
    if links.empty:
        return []
    # literal substring match; na=False keeps non-string entries out of the mask
    wanted = links.str.contains('&t=' + size_code + '&download', regex=False, na=False)
    return links[wanted].tolist()


def _download_images(links, out_dir, extension):
    """Download every link into out_dir as image<N><extension>, N counting from 1.

    A failed download is reported and skipped; its number is not reused, so the
    numbering of the remaining files is unaffected.
    """
    for number, link in enumerate(links, start=1):
        try:
            reply = requests.get(link, headers=_AUTH_HEADERS, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f'skipping image {number}: {err}')
            continue

        if reply.status_code != 200:
            print(f'skipping image {number}: HTTP {reply.status_code}')
            continue

        # os.path.join instead of a hard-coded "\image" so the file lands inside
        # out_dir on every platform
        target = os.path.join(out_dir, 'image' + str(number) + extension)
        with open(target, 'wb') as out_file:
            out_file.write(reply.content)


if sub_collection > 0:
    # collection is broken into sub-collections: query each sub-collection UUID
    target_uuids = _uuids_from_column(df, 'nyplAPI.response.collection')
else:
    # no sub-collections: query each item UUID listed on the collection itself
    target_uuids = _uuids_from_column(df, 'nyplAPI.response.item')

df_item = _fetch_captures(target_uuids)

if filter_text_items == 'y' and 'typeOfResource' in df_item.columns:
    # filters out images of text like table of contents and title pages
    df_item = df_item[df_item['typeOfResource'] != 'text']

if 'imageLinks.imageLink' in df_item.columns:
    img_list = _select_image_links(df_item['imageLinks.imageLink'], image_select)
    _download_images(img_list,
                     os.path.join(ROOT_PATH, collection_label),
                     _EXTENSION_BY_SIZE.get(image_select, '.jpg'))
else:
    print('no image links for collection')
