In [1]:
import json
import requests
from PIL import Image
import os

In [7]:
# Base directory for downloads: the kernel's current working directory.
output_folder = os.getcwd()
# URL of the IIIF manifest processed by the cells below.
iiif_url = "https://www.e-codices.unifr.ch/metadata/iiif/csg-0267/manifest.json"

In [None]:
def _iiif_version_from_context(context) -> int:
    """Return the major version number encoded in a manifest's '@context' value."""
    # '@context' may be a single URI or a list of URIs; when several are given,
    # prefer the one that points at the Presentation API.
    if isinstance(context, str):
        candidates = [context]
    elif isinstance(context, (list, tuple)):
        candidates = [entry for entry in context if isinstance(entry, str)]
    else:
        candidates = []
    if not candidates:
        raise ValueError(f"Unsupported '@context' value: {context!r}")

    presentation = [uri for uri in candidates if '/presentation/' in uri]
    chosen = presentation[-1] if presentation else candidates[-1]
    # e.g. 'http://iiif.io/api/presentation/2/context.json' -> '2'
    return int(chosen.split('/')[-2])


def get_manifest(iiif_url, timeout=30) -> tuple:
    """
    Retrieves the IIIF manifest from a given URL and extracts relevant metadata.

    Args:
        iiif_url (str): The URL of the IIIF manifest.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple: A tuple containing:
            - manifest (dict): The JSON data of the IIIF manifest.
            - iiif_version (int): The IIIF version extracted from the manifest context.
            - library_domain (str): The domain of the library hosting the manifest.

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
        Exception: Any other error raised while fetching or parsing the manifest
            (network failure, invalid JSON, missing or malformed '@context').
            Every error is printed and then re-raised.
    """
    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(iiif_url, timeout=timeout)

        # Fail loudly: callers unpack a 3-tuple, so silently returning None
        # would only surface later as an unrelated unpacking error.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Error loading the manifest: {response.status_code}",
                response=response,
            )

        manifest = response.json()
        print(f"Manifest {iiif_url} loaded with success.")

        iiif_version = _iiif_version_from_context(manifest['@context'])

        # 'scheme://host/path' -> the third '/'-separated field is the host.
        library_domain = iiif_url.split('/')[2]

        return manifest, iiif_version, library_domain
    except Exception as e:
        print(f"An error occurred while loading the manifest: {e}")
        raise  # Re-raise the exception for upstream handling

In [None]:
def _first_label_value(label, default=None):
    """Pick a display string from an IIIF v3 language map, preferring English."""
    if isinstance(label, dict):
        # Try 'en' first, then whatever other language keys exist (e.g. 'none').
        for values in [label.get('en'), *label.values()]:
            if values:
                return values[0]
    return default


def images_data_iiif_3(manifest: dict, library_domain: str) -> tuple:
    """
    Extracts image information from an IIIF v3 manifest.

    Args:
        manifest (dict): The JSON data of the IIIF manifest.
        library_domain (str): The domain of the library hosting the manifest.

    Returns:
        tuple: A tuple containing:
            - ms_name (str): The manuscript's name extracted from the manifest
              label ('en' entry if present, otherwise the first available language).
            - images_info (list of dict): A list containing image metadata such as:
                - canvasNum (int): The index of the canvas.
                - imageLabel (str): The label of the image ("Unknow label" if absent).
                - urlImage (str): The download URL of the image.
                - imageFormat (str): The text after the last '.' of the download URL.
                - imageWidthAsDeclared (int): The declared canvas width (0 if absent).
                - imageHeightAsDeclared (int): The declared canvas height (0 if absent).

        No exception is raised: if anything goes wrong while processing, the error
        is printed and (None, []) is returned.
    """
    try:
        # A manifest without a usable label is treated as an error.
        ms_name = _first_label_value(manifest.get('label'))
        if ms_name is None:
            raise KeyError('label')

        images_info = []

        for idx, canvas in enumerate(manifest['items']):
            image_label = _first_label_value(canvas.get('label'), "Unknow label")

            # First annotation page -> first annotation -> image body.
            image_url = canvas['items'][0]['items'][0]['body']['id']

            if library_domain == 'digi.vatlib.it':
                # For this host the image request parameters are appended to the body id.
                download_url = image_url + '/full/max/0/default.jpg'
            else:
                # Swap ONLY the third-from-last path segment (the size slot of
                # '.../{region}/{size}/{rotation}/{quality}.{format}') for 'max'.
                # A textual replace would also rewrite an identical region
                # segment, e.g. '/full/full/0/' -> '/max/max/0/'.
                segments = image_url.split('/')
                segments[-3] = 'max'
                download_url = '/'.join(segments)

            images_info.append({
                'canvasNum': idx,
                'imageLabel': image_label,
                'urlImage': download_url,
                'imageFormat': download_url.split('.')[-1],
                'imageWidthAsDeclared': canvas.get('width', 0),
                'imageHeightAsDeclared': canvas.get('height', 0)
            })

        print(f"Found {len(images_info)} images in the manifest.")

        return ms_name, images_info

    except Exception as e:
        print(f"An error occured : {e}")
        return None, []

# The extractor takes the parsed manifest and the host name, not the manifest
# URL, so fetch the manifest first.
# NOTE(review): if the configured manifest is not IIIF v3, this v3 extractor
# prints an error and returns (None, []) - confirm which version the URL serves.
_preview_manifest, _preview_version, _preview_domain = get_manifest(iiif_url)
images_data_iiif_3(_preview_manifest, _preview_domain)

In [None]:
def images_data_iiif_2(manifest: dict, library_domain: str) -> list:
    """
    Extracts image information from an IIIF v2 manifest.

    Args:
        manifest (dict): The JSON data of the IIIF manifest.
        library_domain (str): The domain of the library hosting the manifest.

    Returns:
        list of dict: A list containing image metadata, where each dictionary includes:
            - canvasNum (int): The index of the canvas.
            - imageLabel: The canvas label as found in the manifest
              ("Unknow label" if the canvas has none).
            - urlImage (str): The URL of the image.
            - imageFormat (str): The text after the last '.' of urlImage.
            - imageWidthAsDeclared (int): The declared width of the image resource (0 if absent).
            - imageHeightAsDeclared (int): The declared height of the image resource (0 if absent).

        No exception is raised: if anything goes wrong while processing, the error
        is printed and an empty list is returned.
    """
    try:
        # Only the first sequence of the manifest is read.
        canvases = manifest['sequences'][0]['canvases']

        images_info = []

        for idx, canvas in enumerate(canvases):
            # A canvas without a label must not abort the whole extraction.
            image_label = canvas.get('label', "Unknow label")

            # Only the first image attached to the canvas is used.
            image_resource = canvas['images'][0]['resource']
            image_url = image_resource['@id']

            # For this host the image request parameters are appended to the resource id.
            if library_domain == 'digi.vatlib.it':
                image_url = image_url + '/full/full/0/default.jpg'

            # Derive the format from the FINAL URL so it matches what is
            # actually requested (the bare vatlib id ends in '.jp2').
            image_format = image_url.split('.')[-1]

            images_info.append({
                'canvasNum': idx,
                'imageLabel': image_label,
                'urlImage': image_url,
                'imageFormat': image_format,
                'imageWidthAsDeclared': image_resource.get('width', 0),
                'imageHeightAsDeclared': image_resource.get('height', 0)
            })

        print(f"Found {len(images_info)} images in the manifest.")

        return images_info

    except Exception as e:
        print(f"An error occurred: {e}")
        return []



In [None]:
def download_from_iiif(iiif_url, output_folder, timeout=60):
    """
    Downloads every image referenced by an IIIF manifest into a folder.

    The manifest is fetched with get_manifest(), the image list is built with
    images_data_iiif_3() (version 3 and above) or images_data_iiif_2()
    (older versions), and each image is saved as 'image_<n>.jpg' inside
    '<output_folder>/<manuscript name>'.

    Args:
        iiif_url (str): The URL of the IIIF manifest.
        output_folder (str): Directory in which the manuscript folder is created.
        timeout (float): Seconds to wait for each image request.

    Returns:
        list of dict: The image metadata list, each entry enriched with:
            - htmlCode: The HTTP status code of the download, or 'Exception'
              if the request or the file write failed.
            - imageFileName (str) and folderPath (str): Set only when the
              image was saved successfully.
        An empty list is returned when the manifest yields no images.
    """
    loaded = get_manifest(iiif_url)
    if loaded is None:
        # No manifest data to work with.
        return []
    manifest, iiif_version, library_domain = loaded

    # Pick the extractor that matches the manifest's API version.
    if iiif_version >= 3:
        folder_name, images_info = images_data_iiif_3(manifest, library_domain)
    else:
        images_info = images_data_iiif_2(manifest, library_domain)
        # The v2 extractor returns no name: use the manifest label when it is
        # plain text, otherwise a generic folder name.
        label = manifest.get('label')
        folder_name = label if isinstance(label, str) and label else 'iiif_manifest'

    if not images_info:
        # Extraction failed or the manifest has no canvases: nothing to download.
        return []

    # Create the destination folder under output_folder (not under the cwd).
    folder_path = os.path.join(output_folder, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # Walk through the image list and download each one.
    for idx, image in enumerate(images_info):
        url = image['urlImage']
        image_filename = f"image_{idx+1}.jpg"
        image_path = os.path.join(folder_path, image_filename)

        try:
            response = requests.get(url, timeout=timeout)
            status_code = response.status_code
            print(f"Téléchargement de {image_filename} - Code HTTP : {status_code}")

            # Record the status in every case so later cells can filter on it.
            image['htmlCode'] = status_code
            if status_code == 200:
                # Save the image to disk.
                with open(image_path, 'wb') as file:
                    file.write(response.content)
                image['imageFileName'] = image_filename
                image['folderPath'] = folder_path
            else:
                print(f"Erreur lors du téléchargement de l'image : {status_code}")
        except Exception as e:
            # Best effort: one failed image must not stop the remaining downloads.
            print(f"Exception lors du téléchargement de l'image : {e}")
            image['htmlCode'] = 'Exception'

    return images_info

In [None]:
# Load the manifest once and show the version and host it resolves to.
manifest, iiif_version, library_domain = get_manifest(iiif_url)
print(f"{iiif_version} {library_domain}")



In [None]:
# Keep the returned metadata: the next cell iterates over `images_info`.
# `or []` keeps that cell runnable when the downloader returns nothing.
images_info = download_from_iiif(iiif_url, output_folder) or []

In [None]:
# Keys written to each sidecar .txt file, in this order. 'canvasNum' is the
# canvas index key present in the image metadata dictionaries.
data_to_write = [
    'imageFileName', 'canvasNum', 'urlImage', 'folderPath', 'imageLabel',
    'imageWidthAsDeclared', 'imageHeightAsDeclared', 'htmlCode',
    'imageWidthAsDownloaded', 'imageHeightAsDownloaded'
]

# Walk through the images, measure each downloaded file and save its metadata.
for image in images_info:
    image_filename = image.get('imageFileName')

    # Only images that were actually saved (file name set and HTTP 200) are processed.
    if image_filename and image.get('htmlCode') == 200:
        image_path = os.path.join(image['folderPath'], image_filename)
        txt_filename = os.path.splitext(image_filename)[0] + '.txt'
        txt_path = os.path.join(image['folderPath'], txt_filename)

        # Open the file with PIL to read the real size (img.size is (width, height)).
        with Image.open(image_path) as img:
            image_width_downloaded, image_height_downloaded = img.size
            image['imageWidthAsDownloaded'] = image_width_downloaded
            image['imageHeightAsDownloaded'] = image_height_downloaded

        # Write one 'key: value' line per metadata field.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for key in data_to_write:
                value = image.get(key)
                # Only a missing value or an empty string counts as undefined;
                # 0 is legitimate (first canvas index, undeclared dimension).
                if value is None or value == '':
                    value = 'Non défini'
                txt_file.write(f"{key}: {value}\n")

        print(f"Caractéristiques enregistrées dans {txt_filename}")
    else:
        print(f"Image non téléchargée ou erreur lors du téléchargement pour {image.get('imageLabel', 'Inconnu')}")

In [None]:
# Example: append the image request suffix used for digi.vatlib.it to a bare image id.
url = 'https://digi.vatlib.it/iiifimage/MSS_Ross.982/Ross.982_0016_fa_0002v.jp2'
download_url = f"{url}/full/full/0/default.jpg"

download_url

'https://digi.vatlib.it/iiifimage/MSS_Ross.982/Ross.982_0016_fa_0002v.jp2/full/full/0/default.jpg'