In [1]:
import requests
import json
import os
import time
import re
from urllib.parse import urljoin
import math

# NEW: Configuration flags for easy switching of download components.
# They are read as module-level globals by the download functions in this notebook.
# When True, each object's full IIIF manifest is written as "<title>_manifest.json".
DOWNLOAD_MANIFEST = True
# When True, the manifest's label/value metadata is written as "<title>_metadata.json".
DOWNLOAD_METADATA = True
# When True, the images referenced by each manifest are downloaded as well.
DOWNLOAD_IMAGES = False

In [2]:
def sanitize_filename(filename):
    """Return ``filename`` without characters that are unsafe in file names.

    Every occurrence of <, >, :, double quote, /, backslash, |, ? and * is
    removed, then leading and trailing whitespace is stripped.
    """
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    cleaned = forbidden_chars.sub('', filename)
    return cleaned.strip()

def extract_metadata(manifest_data):
    """Flatten a manifest's ``metadata`` list into a ``{label: value}`` dict.

    Each entry's ``label`` becomes a key and its ``value`` the mapped value;
    a missing ``label`` or ``value`` falls back to the empty string. When two
    entries share a label, the later one wins. A manifest without a
    ``metadata`` key yields an empty dict.
    """
    if 'metadata' not in manifest_data:
        return {}
    return {
        entry.get('label', ''): entry.get('value', '')
        for entry in manifest_data['metadata']
    }

# UPDATED: Modified to respect configuration flags
def download_manifest(manifest_url, json_folder, title):
    """Fetch an IIIF manifest and save it and/or its metadata as JSON files.

    What gets written is controlled by the module-level flags
    ``DOWNLOAD_MANIFEST`` (full manifest, ``<title>_manifest.json``) and
    ``DOWNLOAD_METADATA`` (label/value metadata, ``<title>_metadata.json``).
    Both file names use the sanitized title truncated to 100 characters.

    Args:
        manifest_url: URL of the manifest to fetch.
        json_folder: Existing directory the JSON files are written into.
        title: Object title used to build the output file names.

    Returns:
        The parsed manifest, or ``None`` when both flags are off or when the
        request fails or the response is not valid JSON.
    """
    if not (DOWNLOAD_MANIFEST or DOWNLOAD_METADATA):
        print("Manifest and metadata download are disabled.")
        return None

    try:
        response = requests.get(manifest_url)
        response.raise_for_status()
        manifest_data = response.json()

        # Both output names share this stem, so build it before looking at
        # either flag; the metadata-only configuration needs it too.
        file_stem = sanitize_filename(title)[:100]

        if DOWNLOAD_MANIFEST:
            manifest_filepath = os.path.join(json_folder, f"{file_stem}_manifest.json")
            with open(manifest_filepath, 'w', encoding='utf-8') as f:
                json.dump(manifest_data, f, indent=2)
            print(f"Downloaded manifest: {manifest_filepath}")

        if DOWNLOAD_METADATA:
            metadata = extract_metadata(manifest_data)
            metadata_filepath = os.path.join(json_folder, f"{file_stem}_metadata.json")
            with open(metadata_filepath, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2)
            print(f"Saved metadata: {metadata_filepath}")

        return manifest_data
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests
        # versions whose JSON error is not a RequestException subclass.
        print(f"Error processing manifest {manifest_url}: {e}")
        return None

In [3]:
def download_file(url, filepath):
    """Stream ``url`` to ``filepath`` and report whether it succeeded.

    The body is first written to ``<filepath>.part`` and only moved to
    ``filepath`` once the whole stream has been received, so an interrupted
    download never leaves a truncated file under the final name. The HTTP
    response is used as a context manager so its connection is released on
    both success and failure.

    Args:
        url: Address of the file to download.
        filepath: Destination path; its directory must already exist.

    Returns:
        ``True`` on success, ``False`` when the request fails (the error is
        printed and any partial file is removed).
    """
    partial_path = f"{filepath}.part"
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem; replaces any earlier download.
        os.replace(partial_path, filepath)
        return True
    except requests.RequestException as e:
        print(f"Error downloading file {url}: {e}")
        # Drop whatever was written before the stream broke.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

def extract_image_urls(manifest_data):
    """Collect full-size JPEG URLs for every image service in a manifest.

    Walks ``sequences -> canvases -> images`` and, for each image whose
    ``resource`` carries a ``service`` with an ``@id``, builds
    ``<@id>/full/max/0/default.jpg``. Levels that lack the expected key are
    skipped. URLs are returned in document order.
    """
    urls = []
    if 'sequences' not in manifest_data:
        return urls

    for seq in manifest_data['sequences']:
        if 'canvases' not in seq:
            continue
        for canvas in seq['canvases']:
            if 'images' not in canvas:
                continue
            for annotation in canvas['images']:
                has_service = 'resource' in annotation and 'service' in annotation['resource']
                if not has_service:
                    continue
                svc = annotation['resource']['service']
                if '@id' in svc:
                    service_id = svc['@id']
                    urls.append(f"{service_id}/full/max/0/default.jpg")
    return urls
# UPDATED: Modified to respect configuration flag
def download_images(manifest_data, image_folder):
    """Download every image referenced by a manifest into ``image_folder``.

    Does nothing except print a notice when the module-level
    ``DOWNLOAD_IMAGES`` flag is off. Otherwise the URLs found by
    ``extract_image_urls`` are saved as ``image_1.jpg``, ``image_2.jpg``, ...
    and one success or failure line is printed per image.
    """
    if DOWNLOAD_IMAGES:
        for number, url in enumerate(extract_image_urls(manifest_data), start=1):
            destination = os.path.join(image_folder, f"image_{number}.jpg")
            succeeded = download_file(url, destination)
            if succeeded:
                print(f"Downloaded image: {destination}")
            else:
                print(f"Failed to download image: {url}")
    else:
        print("Image download is disabled.")

In [4]:
def get_total_objects(api_url,params):
    """Return the ``info.record_count`` value reported by the search API.

    Issues one GET to ``api_url`` with ``params``. Returns 0 when the request
    fails (the error is printed) or when the response has no
    ``info``/``record_count`` entry.
    """
    try:
        reply = requests.get(api_url, params=params)
        reply.raise_for_status()
        info = reply.json().get('info', {})
    except requests.RequestException as err:
        print(f"Error fetching total object count: {err}")
        return 0
    return info.get('record_count', 0)
    
def get_total_pages(api_url,params): #AMIR
    """Return the ``info.pages`` value reported by the search API.

    Issues one GET to ``api_url`` with ``params``. Returns 0 when the request
    fails (the error is printed) or when the response has no
    ``info``/``pages`` entry.
    """
    try:
        reply = requests.get(api_url, params=params)
        reply.raise_for_status()
        info = reply.json().get('info', {})
    except requests.RequestException as err:
        print(f"Error fetching number of pages: {err}")
        return 0
    return info.get('pages', 0)

In [11]:
# UPDATED: Modified to use new folder structure and respect configuration flags
def process_objects(api_url,params, output_dir, total_objects, total_pages):
    """Walk the paginated search results and download each object's IIIF data.

    Pages 1 through ``total_pages`` are requested in order. Every record that
    carries an IIIF presentation URL gets its own folder
    ``<output_dir>/<systemNumber>/`` holding ``<systemNumber>-json`` and,
    when image download is enabled, ``<systemNumber>-Image``. Processing
    pauses one second after each record.

    Args:
        api_url: Search endpoint URL.
        params: Base query parameters. The dict is not modified; a copy with
            the current ``page`` is sent for each request.
        output_dir: Root directory for the per-object folders.
        total_objects: Total object count, used only in progress messages.
        total_pages: Number of result pages to process.

    Returns:
        int: Number of objects whose manifest was fetched successfully.
        Processing stops early on a request error or when a response has no
        ``records`` key; the count reached so far is returned.
    """
    objects_processed = 0
    manifests_downloaded = 0
    page = 1

    while page <= total_pages:
        try:
            print(f"Processing page {page} of {total_pages}")
            # Send a per-page copy so the caller's dict is left untouched.
            page_params = {**params, "page": page}
            response = requests.get(api_url, page_params)
            response.raise_for_status()
            data = response.json()

            # NOTE(review): 'records' is assumed to be the key that holds the
            # result list -- confirm against the API documentation.
            if 'records' not in data:
                print(f"Unexpected API response structure: {json.dumps(data, indent=2)}")
                return manifests_downloaded

            records = data['records']
            print(f"Found {len(records)} objects on this page")

            for record in records:
                objects_processed += 1
                # NOTE(review): records without 'systemNumber' all map to the
                # same "Unknown" folder, so their files can overwrite each
                # other -- confirm whether the key is always present.
                system_number = record.get('systemNumber', 'Unknown')
                title = record.get('_primaryTitle', 'Untitled')
                print(f"\nProcessing object {objects_processed} of {total_objects}: {system_number}")
                print(f"Title: {title}")

                # Tolerate a missing or null '_images' entry: both mean the
                # record has no manifest.
                manifest_url = (record.get('_images') or {}).get('_iiif_presentation_url')

                if manifest_url:
                    print(f"IIIF Manifest URL: {manifest_url}")

                    # Per-object folder layout.
                    object_folder = os.path.join(output_dir, system_number)
                    json_folder = os.path.join(object_folder, f"{system_number}-json")
                    image_folder = os.path.join(object_folder, f"{system_number}-Image")

                    # Only create the folders that will actually be used.
                    if DOWNLOAD_MANIFEST or DOWNLOAD_METADATA:
                        os.makedirs(json_folder, exist_ok=True)
                    if DOWNLOAD_IMAGES:
                        os.makedirs(image_folder, exist_ok=True)

                    manifest_data = download_manifest(manifest_url, json_folder, title)
                    if manifest_data:
                        if DOWNLOAD_IMAGES:
                            download_images(manifest_data, image_folder)
                        manifests_downloaded += 1
                else:
                    print("No IIIF manifest available for this object")

                time.sleep(1)  # Be nice to the server

            page += 1

        except requests.RequestException as e:
            print(f"Error processing objects page {page} ({api_url}): {e}")
            break

    return manifests_downloaded

In [12]:
# UPDATED: Added user prompt and configuration display
def main(output_dir=r"C:\Users\mazou\Sarvistan\Victoria_Albert_v2", max_pages=2):
    """Query the V&A search API for "iran" objects and download their IIIF data.

    Prints the total object count and the active download configuration,
    asks the user for confirmation, then processes the result pages.

    Args:
        output_dir: Root directory for the downloaded files; created if
            missing. Defaults to the path this notebook has always used.
        max_pages: Upper limit on the number of result pages to process.
            Defaults to 2 (the previous hard-coded value); pass ``None`` to
            process every page the API reports.

    Returns:
        None.
    """
    page_size = 5
    api_url = "https://api.vam.ac.uk/v2/objects/search"
    params = {"q": "iran", "images": 1, "page_size": page_size, "iiif": "true", "page": 1}

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    total_objects = get_total_objects(api_url, params)
    total_pages = get_total_pages(api_url, params)
    # Never ask for more pages than the API says exist.
    pages_to_process = total_pages if max_pages is None else min(total_pages, max_pages)

    print(f"Total number of Iran-related objects with images: {total_objects}")
    print(f"page size: {page_size}") #AMIR
    print(f"Pages to process: {pages_to_process} of {total_pages}")
    print("Download configuration:")
    print(f"  Manifests: {'Enabled' if DOWNLOAD_MANIFEST else 'Disabled'}")
    print(f"  Metadata: {'Enabled' if DOWNLOAD_METADATA else 'Disabled'}")
    print(f"  Images: {'Enabled' if DOWNLOAD_IMAGES else 'Disabled'}")

    # Ask the user before starting any downloads.
    user_input = input("Do you want to continue with the download? (yes/no): ").lower()
    if user_input not in ('yes', 'y'):
        print("Download cancelled by user.")
        return

    if total_objects > 0:
        total_downloaded = process_objects(
            api_url, params, output_dir, total_objects, total_pages=pages_to_process
        )
        print("\nFinished processing objects.")
        print(f"Downloaded {total_downloaded} IIIF manifests and their associated data from Victoria and Albert Museum.")
    else:
        print("No objects found matching the criteria.")

# Entry point: start the download when this code runs as the main module.
if __name__ == "__main__":
    main()

Total number of Iran-related objects with images: 5666
page size: 5
Download configuration:
  Manifests: Enabled
  Metadata: Enabled
  Images: Disabled
Do you want to continue with the download? (yes/no): y
Processing page 1 of 2
Found 5 objects on this page
 records size 5

Processing object 1 of 60: O405551
Title: Shah Isma'il of Iran
IIIF Manifest URL: https://iiif.vam.ac.uk/collections/O405551/manifest.json
Downloaded manifest: C:\Users\mazou\Sarvistan\Victoria_Albert_v2\O405551\O405551-json\Shah Isma'il of Iran_manifest.json
Saved metadata: C:\Users\mazou\Sarvistan\Victoria_Albert_v2\O405551\O405551-json\Shah Isma'il of Iran_metadata.json

Processing object 2 of 60: O1304867
Title: Iran, Untitled
No IIIF manifest available for this object

Processing object 3 of 60: O1304870
Title: Iran, Untitled
No IIIF manifest available for this object

Processing object 4 of 60: O1304869
Title: Iran, Untitled
No IIIF manifest available for this object

Processing object 5 of 60: O1304868
Title