# Excel File and Batch Processing



In [None]:
from IPython.display import HTML
import os
from pathlib import Path

# When the notebook is started from inside the "Tutorials" folder, switch to
# its parent directory so the relative "Tutorials/..." paths used below resolve.
if Path.cwd().name == "Tutorials":
    os.chdir(Path.cwd().parent)

from db_tools import initialize_db, print_table
from excel_tools import construct_description, construct_title, construct_zenodo_metadata, excel_to_dataframe, get_mapped_entry
from file_validator import validate_file
from geo_utilities import retrieve_coordinates
from main_functions import create_record, publish_record, upload_files_into_deposition
from utilities import append_image_metadata_to_description, get_filetype, get_image_metadata, identify_dates_in_exif, load_config, printJSON, search_file, validate_zenodo_metadata, write_json

# Load the tutorial configuration and the Excel sheet the tutorial works on.
excel_path = "Tutorials/Excel/tutorial.xlsx"
config_excel = load_config("Tutorials/Configs/excel_operations.yaml")
excel_df = excel_to_dataframe(excel_path)
excel_df  # last expression of the cell: displays the loaded data

### Column Data Mapping

Now we will use the configuration to map the column headers to certain variables that will be used to create valid Zenodo metadata: 

In [None]:
column_mapping = config_excel["column_mapping"]

print("-- Old Mapping:")
printJSON(column_mapping)

# Excel column header -> name under which the value appears in the mapped row data
column_mapping.update({
    "file name": "FileName",
    "Sublocation": "Location_0",
    "location": "Location_1",
    "province": "Location_2",
    "country": "Location_3",
    "header": "Titles",
    "author, email address": "E-Mail",
})

print("\n-- New Mapping:")
printJSON(column_mapping)

If you change keys that are essential for the process of creating Zenodo Metadata, you should modify the `column_basevalues` in the configuration accordingly.

<br>Let's assume you wanted to map the following column headers:
- author -> Initiator
- copyright -> RightsHolder
- description -> Content
- keywords -> Tags

In [None]:
# The same renames have to be applied to the column mapping and to the
# base values, so they are declared once and written into both sections.
essential_columns = {
    "author": "Initiator",
    "copyright": "RightsHolder",
    "description": "Content",
    "keywords": "Tags",
}
for section in ("column_mapping", "column_basevalues"):
    config_excel[section].update(essential_columns)

printJSON(config_excel["column_basevalues"])

Now, we will use the function `get_mapped_entry`:
<br>Based on the configuration, it processes strings in data fields and returns the mapped row data. With the second (`int`) argument, you can define the desired row:

In [None]:
# Split the keywords field, but keep the description as a single string.
config_excel["settings"].update({
    "split_keywords": True,
    "split_description": False,
})

# Map the first row (index 0) of the Excel data according to the configuration.
mapped_row_data = get_mapped_entry(excel_df, 0, config_excel)
printJSON(mapped_row_data)

Let's check the value of "Content": it is a single string containing three levels of information separated by periods.
<br>If we would like to split it into a list of three strings, we can configure it like this:

In [None]:
# Enable splitting of the description and use the period as split character.
config_excel["misc"]["split_characters"]["description"] = ["."]
config_excel["settings"]["split_description"] = True

# Map the first row again to see the effect of the changed settings.
mapped_row_data = get_mapped_entry(excel_df, 0, config_excel)
printJSON(mapped_row_data)

Exceptions which shall be ignored, like 'Mrs.', 'Str.' or 'ex.', can be defined in:
<br>`config_excel['misc']['split_exceptions']['...']`.

### Mapping of Zenodo Metadata

Now that we have learned how to map and process the data coming from the Excel file, we can start mapping it to the Zenodo metadata. We can use the configuration for that:

In [None]:
# Show the Zenodo metadata section of the configuration.
zenodo_metadata_config = config_excel["zenodo_metadata"]
printJSON(zenodo_metadata_config)

These are the default Zenodo Metadata values, while the other values will be added dynamically, e.g. locations, dates, title, description etc.
<br>This is handled automatically by the function `construct_zenodo_metadata`, but there are many ways to configure it to get the desired output.

#### Construct Title
Let's start with the **Title Constructor**:

In [None]:
data = mapped_row_data
title_constructor = config_excel["title_constructor"]

# Positions refer to keys of the mapped row data; the separator is placed
# between positions (separating blank spaces are added automatically).
title_constructor.update({
    "pos_0": "FileName",
    "pos_1": "Content",  # try changing to Location_0
    # "pos_2": "Location_2",  # try uncommenting this line
    "separator": "-",  # e.g. a hyphen
})

# Show the configuration that is going to be used
print("Configuration of Title Constructor:")
printJSON(title_constructor)

# Build the title from the mapped row data and show it
title = construct_title(data, config_excel)
print(f"\nResulting Title: {title}")

#### Construct Description

Let's continue with the Description Constructor, which is a highly parameterizable module of this Zenodo Toolbox:

In [None]:
# Print the default constructor next to the mapped data so that the
# differing variable names can be compared.
for heading, payload in (
    ("Default Constructor:", config_excel["description_constructor"]),
    ("\nCurrent Mapped Data:", data),
):
    print(heading)
    printJSON(payload)

As you can see, the variable names used in the default constructor differ from the keys of our mapped data, so we have to replace them with the ones we defined.
<br>Here, we can also adapt whatever we like, e.g. remove or add lines, change the HTML style etc.:

In [None]:
# Adapt to your Patterns
config_excel["description_constructor"] = {
    "line_0": "<u>FileName</u>: {FileName} <br>",
    "line_1": "<u>Sublocation</u>: {Location_0} <br>",
    "line_2": "<u>Location</u>: {Location_1} <br>",
    "line_3": "<u>Province</u>: {Location_2} <br>",
    "line_4": "<u>Country</u>: {country} <br>",
    "line_5": "<u>Titles</u>: {Titles} <br>",
    "line_6": "<u>Content</u>: {Content} <br>",
    "line_7": "<u>Tags</u>: {Tags} <br><br>",
    "line_8": "<u>Author</u>: {Initiator} <br>",
    "line_9": "<u>Copyright</u>: {RightsHolder} <br>",
    "line_10": "<u>Rightholder's Mail</u>: {author_email} <br>"
}

description = construct_description(data, custom_config=config_excel)
print("Resulting HTML Description:")
display(HTML(description))

#### File Validation and Path Finder

In our Excel File, we have filenames without any extension listed in the first column, but if we want to upload them into Zenodo Records, we must make sure that the listed file is available and valid.
<br>We will do the following:
1. Search for the listed file in a specified directory, based on its filename and allowed extensions.
2. Validate the file based on its filetype.
3. (optional) Uncomment `filename = "test_image_corrupt"` and check the results!

In [None]:
# Pick the example filename from the Excel DataFrame; `row` selects the entry.
row = 0
filename = excel_df["file name"][row]
# filename = "test_image_corrupt" # try this to see the file validator in action!

# configure images directory to search in and allowed file extensions
config_excel["paths"]["input"]["images"] = "Tutorials/Images"
config_excel["misc"]["image_extensions"] = [".png", ".jpg", ".jpeg"] # (don't worry about case sensitivity)

# search for file in directory specified above
filepath = search_file(filename, config=config_excel)
if not filepath:
    # Without a path there is nothing to validate, so report and stop here
    # instead of handing an empty value to the validator.
    print(f"File not found: {filename}")
else:
    print(f"File identified: '{filepath}'")
    print(f"Filetype of '{Path(filepath).name}': '{get_filetype(filepath)}'")

    # validate image file
    file_validation_errors = validate_file(filepath)
    if file_validation_errors:
        print("File Validation errors:")
        for error in file_validation_errors:
            print(f"- {error}")
    else:
        print("- No File Validation Errors found! -")

#### EXIF Data Integration & Processing

We can integrate our EXIF data into the description, and/or the general image metadata. Try to change `add_exif` to `False` and `True`:

In [None]:
# Read the image metadata (including EXIF, if present) of the example photo.
filepath = "Tutorials/Images/test_image_exif.jpg"
image_metadata = get_image_metadata(filepath)

# Switches for appending image metadata and EXIF data to the description.
add_exif = True # set this to False to exclude EXIF from description
config_excel["settings"]["add_image_metadata_to_description"] = True
config_excel["settings"]["add_exif_to_description"] = add_exif

# Build the plain description first, then append metadata and EXIF data.
base_description = construct_description(data, custom_config=config_excel)
description = append_image_metadata_to_description(base_description, image_metadata, add_exif)
display(HTML(description))

#### Date Information Extraction

With the EXIF data, we can additionally extract Date Information, like the date of capturing, in various formats:

In [None]:
# Only look for dates when the image metadata actually carries EXIF data.
exif_data = image_metadata.get("exif", "")
if exif_data:
    dates_data = identify_dates_in_exif(exif_data)
else:
    dates_data = {}
printJSON(dates_data)

#### Location Information Extraction

In addition, by utilizing the metadata and description, we can query exact or approximate coordinates using Nominatim/OSM:

<small>

<u>Note</u>: This feature is not optimized yet.

</small>

In [None]:
# Look up the row of the EXIF example photo by its file name and map it.
filename_photo = "test_image_exif"
matching_rows = excel_df[excel_df["file name"] == filename_photo]
idx_photo = matching_rows.index[0]
data = get_mapped_entry(excel_df, idx_photo, config_excel)

# Set the user agent used by the geolocator.
config_excel["geolocator"]["user_agent"] = "GeolocatorTutorial/0.1 (your@email.com)"

# Query the coordinates for the mapped row and show the result.
coordinates = retrieve_coordinates(data, config_excel)
print("\nZenodo Coordinates Object:")
printJSON(coordinates)

A backoff mechanism is implemented here, so whenever an exact location is not found, it will try to find the higher order location using the less detailed description.

### Batch Processing: From Excel Data to Zenodo Record


Since we have explored advanced operations, we can finally adapt some in our batch processing routine, where we will do the following:

1) Connect to the Local Database
2) Construct Zenodo Metadata
3) Create new Zenodo Record per row
4) Validate files and upload into deposition
5) Publish Record

The function `construct_zenodo_metadata` will handle many of the operations we have done before, e.g. the construction of the title and description. Your configuration is crucial here.

In [None]:
# Batch routine: for every row of the Excel data, build Zenodo metadata,
# create a record, upload the validated file(s) and publish the record.

# connect to local database
db_connection = initialize_db(load_config("Tutorials/Configs/db_config.yaml"))

try:
    # Configuration that does not depend on the row is applied once, before
    # the loop, so that every row is processed with identical settings.
    config_excel["title_constructor"].pop("pos_1", None)
    upload_exif_json = config_excel["settings"]["upload_exif_json"] = True

    # process excel data
    len_rows = len(excel_df)
    for ct_row in range(len_rows):
        print(f"Processing Row {ct_row+1}/{len_rows} ...")
        # 1) Get mapped Excel Data
        data = get_mapped_entry(excel_df, ct_row, config_excel)

        # 2) Find File specified in mapped Excel Data
        filename = data["FileName"]
        filepath = search_file(filename, config=config_excel)
        assert filepath, f"File not found: {filename}"
        print(f"Filename: {filename}")
        print(f"Filepath: {filepath}")

        # 3) Validate File
        file_validation_errors = validate_file(filepath)
        assert not file_validation_errors, "File Validation errors:\n" + "\n".join(f"- {error}" for error in file_validation_errors)
        print("File successfully validated.")

        # 4) Extract Images Metadata and EXIF Data
        image_metadata = get_image_metadata(filepath)
        print("Image Metadata:")
        printJSON(image_metadata)

        # EXIF data is optional: the metadata may be empty or lack the key.
        # Reading it once keeps steps 5 and 10 consistent with each other.
        exif_data = image_metadata.get("exif", "") if image_metadata else ""

        # 5) Extract Dates from EXIF
        dates_data = {}
        if image_metadata:
            dates_data = identify_dates_in_exif(exif_data) if exif_data else {}
            if dates_data:
                print("Dates Data:")
                printJSON(dates_data)
            else:
                print("No Dates identified.")

        # 6) Query Coordinates from Location Information
        locations_data = retrieve_coordinates(data, config_excel) if config_excel["settings"]["retrieve_coordinates"] else []
        if locations_data:
            print("Locations Data:")
            printJSON(locations_data)
        else:
            print("No Locations identified.")

        # 7) Construct Zenodo Metadata (configured before the loop)
        zenodo_metadata = construct_zenodo_metadata(row_data=data, image_metadata=image_metadata,
                                                    dates_data=dates_data, locations_data=locations_data,
                                                    files_data={}, custom_config=config_excel)
        printJSON(zenodo_metadata)

        # 8) Validate constructed Zenodo Metadata
        zenodo_metadata_validation_errors = validate_zenodo_metadata(zenodo_metadata)
        assert not zenodo_metadata_validation_errors, "Zenodo Metadata Validation errors:\n" + "\n".join(f"- {error}" for error in zenodo_metadata_validation_errors)
        print("Zenodo Metadata successfully validated.")

        # 9) Create Record
        create_msg, create_data = create_record(zenodo_metadata, db_connection)
        assert create_msg["success"], f"Failed to create Record: {create_msg['text']}"
        concept_recid = create_data["conceptrecid"]
        print("Operation:")
        print_table(db_connection, "operations", concept_recid)

        # 10) Write JSON if EXIF
        filepaths = [filepath]
        if exif_data and upload_exif_json:
            exif_json_path = f'{config_excel["paths"]["output"]["exif_json"]}/{concept_recid}/{Path(filepath).stem}.json'
            # Only upload the JSON when writing it succeeded.
            if write_json(exif_data, exif_json_path):
                filepaths.append(exif_json_path)

        # 11) Upload Files
        # NOTE(review): the meaning of the third positional argument (False)
        # is defined in main_functions and not visible here; confirm.
        fileupload_msg, fileupload_data = upload_files_into_deposition(create_data, filepaths, False, db_connection)
        assert fileupload_msg["success"], f"Failed to upload Files: {fileupload_msg['text']}"
        print("Operation:")
        print_table(db_connection, "operations", concept_recid)

        # 12) Set additional_data before publishing for Database entry and Filetables
        additional_data = {
            "type": "image",
            "subset": "excel-tutorial",
            "filedata": fileupload_data
        }

        # 13) Publish Record
        publish_msg, publish_data = publish_record(create_data, db_connection, additional_data)
        assert publish_msg["success"], f"Failed to publish Record with ConceptRecID {concept_recid}"
        print("Operation:")
        print_table(db_connection, "operations", publish_data["conceptrecid"])
        print("Record:")
        print_table(db_connection, "records", publish_data["conceptrecid"])
finally:
    # Close database connection, even if one of the steps above failed
    db_connection.close()

## Conclusion

In your [uploads section](https://sandbox.zenodo.org/me/uploads), you should now see those five records. If you open the record called "test_image_exif", you should additionally see a JSON file containing the EXIF data among the record's files; the same EXIF data should also appear in the description, encoded as HTML. If you click on 'Preview' next to the JSON file, you will see it displayed with Unicode escape sequences, such as "\u00a9" representing the copyright symbol.

If you want to add filetables, you must create the record first in order to receive the RecordID and assign the filelinks according to that (`{filename: directlink}`), which you can provide to the function `construct_zenodo_metadata` as a dictionary using the argument `files_data`. Keep in mind that the configuration settings/add_filetables_to_description must be set to `true`.

<br>This was just an example; every step can be tweaked or expanded, e.g. by adding **resized images**, **masking persons on images**, **rendering thumbnails for 3D models** or additionally **uploading the EXIF as JSON**.