# Imports and Setup
This section imports necessary libraries, handles warnings, and sets up the environment for the script.

In [None]:
import copy
import json
import logging
import os
import warnings
import zipfile

import torch
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

# Suppress every warning for the rest of the session: the "ignore" action is
# installed without a category, message, or module restriction.
warnings.filterwarnings("ignore")

# Load Environment Variables and Initialize Neo4j Graph
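This section loads environment variables from a `.env` file, initializes a connection to the Neo4j database using the provided credentials, and defines the vector index settings and the (constraint name, node label) pairs referenced elsewhere in the notebook.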

In [None]:
# Read settings from the local .env file; override=True lets values in the file
# replace variables that are already set in the process environment.
load_dotenv(".env", override=True)
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
# Falls back to the database name "neo4j" when NEO4J_DATABASE is unset or empty.
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE") or "neo4j"

# Vector index settings. Only VECTOR_INDEX_NAME is referenced in the visible
# cells (the commented-out DROP INDEX); the others are presumably consumed by
# the index-creation step later in the notebook — TODO confirm.
VECTOR_INDEX_NAME = "texts_from_records"
VECTOR_NODE_LABEL = "recordWithText"
VECTOR_SOURCE_PROPERTY = ["text"]
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"

# Shared graph connection; schema introspection is switched off at construction
# (enhanced_schema=False, refresh_schema=False).
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    enhanced_schema=False,
    refresh_schema=False,
)

# (constraint name, node label) pairs; the commented-out cleanup cell unpacks
# them as `for name, label in constraints`.
# NOTE(review): the label "specificRecordsTypes" (plural) differs from the
# authority type key "specificRecordsType" used by authorities_schema — confirm
# which spelling the node labels actually use.
constraints = [
    ("unique_item", "item"),
    ("unique_fileUnit", "fileUnit"),
    ("unique_series", "series"),
    ("unique_recordGroup", "recordGroup"),
    ("unique_collection", "collection"),
    ("unique_geographicPlaceName", "geographicPlaceName"),
    ("unique_organization", "organization"),
    ("unique_person", "person"),
    ("unique_specificRecordsTypes", "specificRecordsTypes"),
    ("unique_topicalSubject", "topicalSubject"),
]

# Dataset Paths
This section defines the paths to the ZIP archives containing the dataset files.

In [None]:
# Absolute paths of the export archives to process. The processing loop treats
# any path containing "authorities" as an authority export and everything else
# as a description export.
zip_paths = [
    r"H:\nac_export_descriptions_2025-04-08.zip",
    r"H:\nac_export_authorities_2025-04-08.zip",
]

# Save Folder List to File (Commented Out)
This section contains a function to save the list of folder names from ZIP archives to a text file, which can be used to track processing progress. It's disabled by default to prevent accidental execution.

In [None]:
# def save_folders_to_file(folders, filename="remaining_folders.txt"):
#     with open(filename, "w", encoding="utf-8") as f:
#         for folder in folders:
#             f.write(folder + "\n")


# folders_saved = []

# for zip_path in zip_paths:
#     with zipfile.ZipFile(zip_path, "r") as archive:
#         folders_saved.extend([i[:-1] for i in archive.namelist() if i.endswith("/")])

# save_folders_to_file(folders_saved)

# Database Cleanup (Commented Out)
This section contains commented-out queries to delete all nodes and drop existing constraints and indexes. It's disabled by default to prevent accidental data loss.

In [None]:
# kg.query(
#     """CALL apoc.periodic.iterate(
#   "MATCH (n) RETURN n",
#   "DETACH DELETE n",
#   {batchSize:1000})"""
# )

# for name, label in constraints:
#     cypher = f"DROP CONSTRAINT {name} IF EXISTS"
#     kg.query(cypher)

# kg.query(f"DROP INDEX {VECTOR_INDEX_NAME} IF EXISTS")

# Load Embedding Model
This section detects the available device (CUDA or CPU) and loads a pre-trained embedding model and its tokenizer from Hugging Face, moving the model to that device in evaluation mode.

In [None]:
# Run the embedder on the GPU when PyTorch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The Hugging Face model id is taken from the environment (for example the .env
# file loaded earlier) instead of being left as an empty assignment, which was
# a syntax error. Fail early with a clear message when it is not configured.
embedder_model_name = os.getenv("EMBEDDER_MODEL_NAME")
if not embedder_model_name:
    raise ValueError(
        "EMBEDDER_MODEL_NAME is not set; define the Hugging Face model id "
        "of the embedder in the environment or in the .env file."
    )
embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_model_name)
# .eval() switches the model to inference mode before it is used for embedding.
embedder_model = AutoModel.from_pretrained(embedder_model_name).to(device).eval()

# Configure Logging
This section configures logging to write progress and error messages to a file for tracking the script's execution.

In [None]:
# Append INFO-and-above records to logs.txt as UTF-8, each line prefixed with
# the record's timestamp.
logging.basicConfig(
    filename="logs.txt",
    filemode="a",
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
    encoding="utf-8",
)
# Named logger used by the archive-processing cell for progress messages.
logger = logging.getLogger("progress_logger")

# Helper Functions for Data Processing
This section defines several helper functions to process ancestors, unpack data from JSON, handle transcriptions, split text into chunks, show progress, and extract information from description and authority files.

In [None]:
def process_ancestors(file):
    """Return the naId of every ancestor of a record, in reversed stored order.

    Args:
        file: Parsed JSON line; must contain ``file["record"]["ancestors"]``
            as a sequence of mappings that each have an ``"naId"`` key.

    Returns:
        A list of naId values, last stored ancestor first.
    """
    return [ancestor["naId"] for ancestor in reversed(file["record"]["ancestors"])]


def unpack(data, keys, default="N/A"):
    """Read one schema entry out of a record mapping.

    Args:
        data: Mapping to read from (a record's JSON object).
        keys: Either a single key (``str``) or a ``[parent, child]`` pair.
        default: Value used whenever the requested value cannot be resolved.

    Returns:
        * ``keys`` is a string: ``data.get(keys, default)``.
        * parent value is a string (including the default): that string.
        * parent value is a list: one entry per element, the element's
          ``child`` value or ``default``.
        * parent value is a dict: its ``child`` value or ``default``.
        * anything else (``None``, numbers, ...): ``default``.
    """
    if isinstance(keys, str):
        return data.get(keys, default)

    parent = data.get(keys[0], default)
    if isinstance(parent, str):
        return parent
    if isinstance(parent, list):
        # Elements that are not mappings cannot hold the child key, so they
        # contribute the default instead of raising AttributeError.
        return [
            item.get(keys[1], default) if isinstance(item, dict) else default
            for item in parent
        ]
    if isinstance(parent, dict):
        return parent.get(keys[1], default)
    # Null or scalar parent: the nested value does not exist. Previously this
    # path fell off the end of the function and returned None.
    return default


def process_transcription(file):
    """Concatenate all transcription contributions of a record.

    Args:
        file: Parsed JSON line. ``file["record_transcription"]``, when present
            and not ``None``, is iterated and each element's
            ``"contribution"`` string is appended in order.

    Returns:
        The concatenated text, or ``"N/A"`` when the key is absent or ``None``.
    """
    contributions = file.get("record_transcription")
    if contributions is None:
        return "N/A"
    return "".join(entry["contribution"] for entry in contributions)


def split_into_chunks(file_dict, full_text, file_name, line_num):
    """Turn one record into chunk dictionaries ready for storage.

    Each chunk is an independent deep copy of ``file_dict`` extended with
    ``text``, ``chunkSeqId`` (0-based position) and ``chunkId``. The chunkId is
    ``<name>_<line_num>-chunk<NNNN>`` where ``<name>`` is the part of
    ``file_name`` between its first "/" and its first ".".

    Args:
        file_dict: Properties shared by every chunk of the record.
        full_text: Text to split. ``None``, ``""`` and ``"N/A"`` all mean
            "no text" and yield a single placeholder chunk with text "N/A".
        file_name: Archive member name the record came from.
        line_num: 1-based line number of the record inside that member.

    Returns:
        ``(chunks, num_chunks)`` — the chunk list and its length.
    """
    if full_text is None or full_text == "":
        full_text = "N/A"

    # Compare by value: identity (`is`) on a string literal only works when
    # the interpreter happens to reuse the same object.
    if full_text == "N/A":
        text_chunks = ["N/A"]
    else:
        text_chunks = text_splitter.split_text(full_text)

    # Shared by every chunk of this record, so compute it once.
    id_prefix = (
        f"{file_name[file_name.find('/') + 1 : file_name.find('.')]}_{line_num}"
    )

    chunks = []
    for chunkSeqId, chunk in enumerate(text_chunks):
        # Deep copy so chunks never share nested lists with each other or
        # with the caller's dictionary.
        chunk_dict = copy.deepcopy(file_dict)
        chunk_dict["text"] = chunk
        chunk_dict["chunkSeqId"] = chunkSeqId
        chunk_dict["chunkId"] = f"{id_prefix}-chunk{chunkSeqId:04d}"
        chunks.append(chunk_dict)

    return chunks, len(chunks)


def show_progress(file_name, line_num, num_chunks):
    """Print a one-line progress report for a processed record.

    Args:
        file_name: Archive member name; the text before the first "/" is shown
            as the folder and the second "/"-separated segment as the file.
            Without any "/", the whole name is shown for both.
        line_num: Line number of the record inside the file.
        num_chunks: Number of chunks produced for the record.
    """
    segments = file_name.split("/")
    folder_name = segments[0]
    if len(segments) > 1:
        short_file_name = segments[1]
    else:
        short_file_name = file_name

    report = f"Folder: {folder_name}. File: {short_file_name}. Line: {line_num} - processed. Number of chunks: {num_chunks}"
    print(report)


def get_description_info(file_name):
    """Parse one description JSONL member and append its chunks to ``result``.

    Relies on module-level names: ``archive`` (the open ZipFile), ``result``
    (the list being filled) and ``descriptions_schema``.

    For "item" and "fileUnit" records the chunked text is the longest of the
    joined extracted text, the scope-and-content note and the transcription.
    For every other level it is the scope-and-content note, which is then
    removed from the record's properties.

    Args:
        file_name: Name of the JSONL member inside ``archive``.

    Returns:
        The module-level ``result`` list.
    """
    with archive.open(file_name) as handle:
        for line_num, raw_line in enumerate(handle, 1):
            if not raw_line.strip():
                continue  # skip blank lines

            file = json.loads(raw_line)
            record = file["record"]
            file_dict = {"file_name": file_name, "line_num": line_num}

            if record.get("ancestors") is not None:
                file_dict["ancestors"] = process_ancestors(file)

            # Copy every property listed in the schema for this level.
            for prop_name, key_path in descriptions_schema[record["levelOfDescription"]]:
                file_dict[prop_name] = unpack(record, key_path)

            if file_dict["levelOfDescription"] in ("item", "fileUnit"):
                file_dict["record_transcription"] = process_transcription(file)

                # Merge the per-object extracted texts, dropping placeholders.
                file_dict["extractedText_digitalObjects"] = "".join(
                    piece
                    for piece in file_dict["extractedText_digitalObjects"]
                    if piece != "N/A"
                )

                candidates = (
                    file_dict["extractedText_digitalObjects"],
                    file_dict["scopeAndContentNote"],
                    file_dict["record_transcription"],
                )
                # Ties go to the earliest candidate in the tuple.
                full_text = max(candidates, key=len)
            else:
                full_text = file_dict.pop("scopeAndContentNote")

            chunks, num_chunks = split_into_chunks(
                file_dict, full_text, file_name, line_num
            )
            result.extend(chunks)

            # show_progress(file_name, line_num, num_chunks)

        return result


def get_authority_info(file_name):
    """Parse one authority JSONL member and append its chunks to ``result``.

    Relies on module-level names: ``archive`` (the open ZipFile), ``result``
    (the list being filled) and ``authorities_schema``.

    The chunked text is the record's note field, chosen by authority type and
    removed from the record's properties: ``scopeNote`` for
    geographicPlaceName / specificRecordsType / topicalSubject,
    ``administrativeHistoryNote`` for organization, and ``biographicalNote``
    for anything else.

    Args:
        file_name: Name of the JSONL member inside ``archive``.

    Returns:
        The module-level ``result`` list.
    """
    note_field_by_type = {
        "geographicPlaceName": "scopeNote",
        "specificRecordsType": "scopeNote",
        "topicalSubject": "scopeNote",
        "organization": "administrativeHistoryNote",
    }

    with archive.open(file_name) as handle:
        for line_num, raw_line in enumerate(handle, 1):
            if not raw_line.strip():
                continue  # skip blank lines

            record = json.loads(raw_line)["record"]
            file_dict = {"file_name": file_name, "line_num": line_num}

            # Copy every property listed in the schema for this authority type.
            for prop_name, key_path in authorities_schema[record["authorityType"]]:
                file_dict[prop_name] = unpack(record, key_path)

            note_field = note_field_by_type.get(
                file_dict["authorityType"], "biographicalNote"
            )
            full_text = file_dict.pop(note_field)

            chunks, num_chunks = split_into_chunks(
                file_dict, full_text, file_name, line_num
            )
            result.extend(chunks)

            # show_progress(file_name, line_num, num_chunks)

        return result

# Initialize Text Splitter
This section initializes a RecursiveCharacterTextSplitter from Langchain to split long texts into manageable chunks for embedding and storage.

In [None]:
# Module-level splitter used by split_into_chunks(). Chunks are capped at
# chunk_size=500 (presumably characters, the library's default length
# measure — TODO confirm) with no overlap. The separators are listed from
# paragraph breaks down to the empty string, and keep_separator=True is set so
# the matched separator is kept in the output.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    keep_separator=True,
)

# Define Schemas for Descriptions and Authorities
This section defines schemas (dictionaries) that map property names to paths in the JSON data for different levels of archival descriptions (item, fileUnit, series, recordGroup, collection) and authority records such as geographic places, organizations, persons, specific record types, and topical subjects.

In [None]:
# Properties shared by every description level. Each entry maps the property
# name stored on the node to the lookup passed to unpack(): a plain string is a
# top-level key of the record, a two-element list is [parent key, child key].
base_schema_description = {
    "beginCongress": "beginCongress",
    "logicalDate_coverageEndDate": ["coverageEndDate", "logicalDate"],
    "logicalDate_coverageStartDate": ["coverageStartDate", "logicalDate"],
    "groupName_dataControlGroup": ["dataControlGroup", "groupName"],
    "dateNote": "dateNote",
    "endCongress": "endCongress",
    "generalNotes": "generalNotes",
    "generalRecordsTypes": "generalRecordsTypes",
    "languages": "languages",
    "levelOfDescription": "levelOfDescription",
    "localIdentifier": "localIdentifier",
    "naId": "naId",
    "partyDesignation": "partyDesignation",
    "recordType": "recordType",
    "scaleNote": "scaleNote",
    "scopeAndContentNote": "scopeAndContentNote",
    "title": "title",
}

# Properties extracted for records whose levelOfDescription is "item":
# the shared base fields plus the item-specific ones below.
# "extractedText_digitalObjects" is one of the three text sources that
# get_description_info() compares when choosing the text to chunk.
schema_item = {
    **base_schema_description,
    "accessionNumbers": "accessionNumbers",
    "description_accessRestriction": ["accessRestriction", "description"],
    "note_accessRestriction": ["accessRestriction", "note"],
    "status_accessRestriction": ["accessRestriction", "status"],
    "audiovisual": "audiovisual",
    "authorityType_contributors": ["contributors", "authorityType"],
    "contributorType_contributors": ["contributors", "contributorType"],
    "heading_contributors": ["contributors", "heading"],
    "naId_contributors": ["contributors", "naId"],
    "custodialHistoryNote": "custodialHistoryNote",
    "extractedText_digitalObjects": ["digitalObjects", "extractedText"],
    "objectDescription_digitalObjects": ["digitalObjects", "objectDescription"],
    "objectType_digitalObjects": ["digitalObjects", "objectType"],
    "objectUrl_digitalObjects": ["digitalObjects", "objectUrl"],
    "internalTransferNumbers": "internalTransferNumbers",
    "identifier_microformPublications": ["microformPublications", "identifier"],
    "note_microformPublications": ["microformPublications", "note"],
    "title_microformPublications": ["microformPublications", "title"],
    "description_onlineResources": ["onlineResources", "description"],
    "note_onlineResources": ["onlineResources", "note"],
    "url_onlineResources": ["onlineResources", "url"],
    "otherTitles": "otherTitles",
    "logicalDate_productionDates": ["productionDates", "logicalDate"],
    "recordsCenterTransferNumbers": "recordsCenterTransferNumbers",
    "authorityType_subjects": ["subjects", "authorityType"],
    "heading_subjects": ["subjects", "heading"],
    "naId_subjects": ["subjects", "naId"],
    "subtitle": "subtitle",
    "transferNote": "transferNote",
    "note_useRestriction": ["useRestriction", "note"],
    "specificUseRestrictions_useRestriction": [
        "useRestriction",
        "specificUseRestrictions",
    ],
    "status_useRestriction": ["useRestriction", "status"],
    "note_variantControlNumbers": ["variantControlNumbers", "note"],
    "number_variantControlNumbers": ["variantControlNumbers", "number"],
    "type_variantControlNumbers": ["variantControlNumbers", "type"],
    "copyStatus_physicalOccurrences": ["physicalOccurrences", "copyStatus"],
    "extent_physicalOccurrences": ["physicalOccurrences", "extent"],
    "physicalOccurrenceNote_physicalOccurrences": [
        "physicalOccurrences",
        "physicalOccurrenceNote",
    ],
}

# Properties extracted for records whose levelOfDescription is "fileUnit":
# the shared base fields plus the file-unit-specific ones below (including the
# finding-aid fields and itemCount, which schema_item does not have).
schema_fileUnit = {
    **base_schema_description,
    "accessionNumbers": "accessionNumbers",
    "description_accessRestriction": ["accessRestriction", "description"],
    "note_accessRestriction": ["accessRestriction", "note"],
    "status_accessRestriction": ["accessRestriction", "status"],
    "arrangement": "arrangement",
    "audiovisual": "audiovisual",
    "authorityType_contributors": ["contributors", "authorityType"],
    "contributorType_contributors": ["contributors", "contributorType"],
    "heading_contributors": ["contributors", "heading"],
    "naId_contributors": ["contributors", "naId"],
    "custodialHistoryNote": "custodialHistoryNote",
    "extractedText_digitalObjects": ["digitalObjects", "extractedText"],
    "objectDescription_digitalObjects": ["digitalObjects", "objectDescription"],
    "objectType_digitalObjects": ["digitalObjects", "objectType"],
    "objectUrl_digitalObjects": ["digitalObjects", "objectUrl"],
    "editStatus": "editStatus",
    "fileFormat_findingAids": ["findingAids", "fileFormat"],
    "findingAidtype_findingAids": ["findingAids", "findingAidtype"],
    "note_findingAids": ["findingAids", "note"],
    "source_findingAids": ["findingAids", "source"],
    "url_findingAids": ["findingAids", "url"],
    "urlNote_findingAids": ["findingAids", "urlNote"],
    "urlDescription_findingAids": ["findingAids", "urlDescription"],
    "internalTransferNumbers": "internalTransferNumbers",
    "itemCount": "itemCount",
    "identifier_microformPublications": ["microformPublications", "identifier"],
    "note_microformPublications": ["microformPublications", "note"],
    "title_microformPublications": ["microformPublications", "title"],
    "description_onlineResources": ["onlineResources", "description"],
    "note_onlineResources": ["onlineResources", "note"],
    "url_onlineResources": ["onlineResources", "url"],
    "otherTitles": "otherTitles",
    "copyStatus_physicalOccurrences": ["physicalOccurrences", "copyStatus"],
    "extent_physicalOccurrences": ["physicalOccurrences", "extent"],
    "physicalOccurrenceNote_physicalOccurrences": [
        "physicalOccurrences",
        "physicalOccurrenceNote",
    ],
    "recordsCenterTransferNumbers": "recordsCenterTransferNumbers",
    "soundType": "soundType",
    "authorityType_subjects": ["subjects", "authorityType"],
    "heading_subjects": ["subjects", "heading"],
    "naId_subjects": ["subjects", "naId"],
    "subtitle": "subtitle",
    "transferNote": "transferNote",
    "note_useRestriction": ["useRestriction", "note"],
    "specificUseRestrictions_useRestriction": [
        "useRestriction",
        "specificUseRestrictions",
    ],
    "status_useRestriction": ["useRestriction", "status"],
    "note_variantControlNumbers": ["variantControlNumbers", "note"],
    "number_variantControlNumbers": ["variantControlNumbers", "number"],
    "type_variantControlNumbers": ["variantControlNumbers", "type"],
}

# Properties extracted for records whose levelOfDescription is "series":
# the shared base fields plus creators, inclusive dates, counts and the other
# series-specific fields below.
# NOTE(review): unlike schema_fileUnit, this schema has no
# "urlDescription_findingAids" and no "description_onlineResources" entry —
# confirm the omission is intentional.
schema_series = {
    **base_schema_description,
    "accessionNumbers": "accessionNumbers",
    "description_accessRestriction": ["accessRestriction", "description"],
    "note_accessRestriction": ["accessRestriction", "note"],
    "status_accessRestriction": ["accessRestriction", "status"],
    "arrangement": "arrangement",
    "audiovisual": "audiovisual",
    "authorityType_contributors": ["contributors", "authorityType"],
    "contributorType_contributors": ["contributors", "contributorType"],
    "heading_contributors": ["contributors", "heading"],
    "naId_contributors": ["contributors", "naId"],
    "authorityType_creators": ["creators", "authorityType"],
    "creatorType_creators": ["creators", "creatorType"],
    "heading_creators": ["creators", "heading"],
    "naId_creators": ["creators", "naId"],
    "custodialHistoryNote": "custodialHistoryNote",
    "dispositionAuthorityNumbers": "dispositionAuthorityNumbers",
    "editStatus": "editStatus",
    "fileUnitCount": "fileUnitCount",
    "fileFormat_findingAids": ["findingAids", "fileFormat"],
    "findingAidtype_findingAids": ["findingAids", "findingAidtype"],
    "note_findingAids": ["findingAids", "note"],
    "source_findingAids": ["findingAids", "source"],
    "url_findingAids": ["findingAids", "url"],
    "urlNote_findingAids": ["findingAids", "urlNote"],
    "functionAndUse": "functionAndUse",
    "logicalDate_inclusiveEndDate": ["inclusiveEndDate", "logicalDate"],
    "logicalDate_inclusiveStartDate": ["inclusiveStartDate", "logicalDate"],
    "internalTransferNumbers": "internalTransferNumbers",
    "itemCount": "itemCount",
    "identifier_microformPublications": ["microformPublications", "identifier"],
    "note_microformPublications": ["microformPublications", "note"],
    "title_microformPublications": ["microformPublications", "title"],
    "numberingNote": "numberingNote",
    "note_onlineResources": ["onlineResources", "note"],
    "url_onlineResources": ["onlineResources", "url"],
    "otherTitles": "otherTitles",
    "copyStatus_physicalOccurrences": ["physicalOccurrences", "copyStatus"],
    "extent_physicalOccurrences": ["physicalOccurrences", "extent"],
    "physicalOccurrenceNote_physicalOccurrences": [
        "physicalOccurrences",
        "physicalOccurrenceNote",
    ],
    "recordsCenterTransferNumbers": "recordsCenterTransferNumbers",
    "soundType": "soundType",
    "authorityType_subjects": ["subjects", "authorityType"],
    "heading_subjects": ["subjects", "heading"],
    "naId_subjects": ["subjects", "naId"],
    "transferNote": "transferNote",
    "note_useRestriction": ["useRestriction", "note"],
    "specificUseRestrictions_useRestriction": [
        "useRestriction",
        "specificUseRestrictions",
    ],
    "status_useRestriction": ["useRestriction", "status"],
    "note_variantControlNumbers": ["variantControlNumbers", "note"],
    "number_variantControlNumbers": ["variantControlNumbers", "number"],
    "type_variantControlNumbers": ["variantControlNumbers", "type"],
}

# Properties extracted for records whose levelOfDescription is "recordGroup":
# the shared base fields plus finding aids, inclusive dates, the record group
# number, reference-unit contact details and the series count.
schema_recordGroup = {
    **base_schema_description,
    "fileFormat_findingAids": ["findingAids", "fileFormat"],
    "findingAidtype_findingAids": ["findingAids", "findingAidtype"],
    "note_findingAids": ["findingAids", "note"],
    "source_findingAids": ["findingAids", "source"],
    "url_findingAids": ["findingAids", "url"],
    "urlNote_findingAids": ["findingAids", "urlNote"],
    "logicalDate_inclusiveEndDate": ["inclusiveEndDate", "logicalDate"],
    "logicalDate_inclusiveStartDate": ["inclusiveStartDate", "logicalDate"],
    "recordGroupNumber": "recordGroupNumber",
    "address1_referenceUnits": ["referenceUnits", "address1"],
    "address2_referenceUnits": ["referenceUnits", "address2"],
    "city_referenceUnits": ["referenceUnits", "city"],
    "email_referenceUnits": ["referenceUnits", "email"],
    "fax_referenceUnits": ["referenceUnits", "fax"],
    "mailCode_referenceUnits": ["referenceUnits", "mailCode"],
    "name_referenceUnits": ["referenceUnits", "name"],
    "phone_referenceUnits": ["referenceUnits", "phone"],
    "postalCode_referenceUnits": ["referenceUnits", "postalCode"],
    "state_referenceUnits": ["referenceUnits", "state"],
    "seriesCount": "seriesCount",
}

# Properties extracted for records whose levelOfDescription is "collection":
# the shared base fields plus the collection identifier, donors, finding aids,
# inclusive dates, reference-unit contact details, the series count and
# variant control numbers.
schema_collection = {
    **base_schema_description,
    "collectionIdentifier": "collectionIdentifier",
    "authorityType_donors": ["donors", "authorityType"],
    "heading_donors": ["donors", "heading"],
    "naId_donors": ["donors", "naId"],
    "fileFormat_findingAids": ["findingAids", "fileFormat"],
    "findingAidtype_findingAids": ["findingAids", "findingAidtype"],
    "note_findingAids": ["findingAids", "note"],
    "source_findingAids": ["findingAids", "source"],
    "url_findingAids": ["findingAids", "url"],
    "urlNote_findingAids": ["findingAids", "urlNote"],
    "logicalDate_inclusiveEndDate": ["inclusiveEndDate", "logicalDate"],
    "logicalDate_inclusiveStartDate": ["inclusiveStartDate", "logicalDate"],
    "address1_referenceUnits": ["referenceUnits", "address1"],
    "address2_referenceUnits": ["referenceUnits", "address2"],
    "city_referenceUnits": ["referenceUnits", "city"],
    "email_referenceUnits": ["referenceUnits", "email"],
    "fax_referenceUnits": ["referenceUnits", "fax"],
    "mailCode_referenceUnits": ["referenceUnits", "mailCode"],
    "name_referenceUnits": ["referenceUnits", "name"],
    "phone_referenceUnits": ["referenceUnits", "phone"],
    "postalCode_referenceUnits": ["referenceUnits", "postalCode"],
    "state_referenceUnits": ["referenceUnits", "state"],
    "seriesCount": "seriesCount",
    "note_variantControlNumbers": ["variantControlNumbers", "note"],
    "number_variantControlNumbers": ["variantControlNumbers", "number"],
    "type_variantControlNumbers": ["variantControlNumbers", "type"],
}

# Lookup used by get_description_info(): a record's levelOfDescription value
# selects the (property name, lookup) pairs to extract. The values are
# dict.items() views, so they reflect later changes to the schema dicts.
# A level that is not listed here raises KeyError at lookup time.
descriptions_schema = {
    "item": schema_item.items(),
    "fileUnit": schema_fileUnit.items(),
    "series": schema_series.items(),
    "recordGroup": schema_recordGroup.items(),
    "collection": schema_collection.items(),
}

# Properties shared by every authority type. As with the description schemas,
# a string value is a top-level record key and a two-element list is
# [parent key, child key], both resolved by unpack().
base_schema_authority = {
    "authorityType": "authorityType",
    "heading": "heading",
    "naId": "naId",
    "recordType": "recordType",
    "sourceNotes": "sourceNotes",
}

# Properties extracted for authority records of type "geographicPlaceName".
# "scopeNote" is the field get_authority_info() removes and chunks as text.
schema_geographic_place_name = {
    **base_schema_authority,
    "description_broaderTerms": ["broaderTerms", "description"],
    "naId_broaderTerms": ["broaderTerms", "naId"],
    "heading_broaderTerms": ["broaderTerms", "heading"],
    "coordinates": "coordinates",
    "importRecordControlNumber": "importRecordControlNumber",
    "geographicPlaceName_linkCounts": ["linkCounts", "geographicPlaceName"],
    "jurisdiction_linkCounts": ["linkCounts", "jurisdiction"],
    "organization_linkCounts": ["linkCounts", "organization"],
    "subject_linkCounts": ["linkCounts", "subject"],
    "totalDescription_linkCounts": ["linkCounts", "totalDescription"],
    "naId_narrowerTerms": ["narrowerTerms", "naId"],
    "heading_narrowerTerms": ["narrowerTerms", "heading"],
    "naId_relatedTerms": ["relatedTerms", "naId"],
    "heading_relatedTerms": ["relatedTerms", "heading"],
    "recordSource": "recordSource",
    "scopeNote": "scopeNote",
    "useFor": "useFor",
}

# Properties extracted for authority records of type "organization".
# "administrativeHistoryNote" is the field get_authority_info() removes and
# chunks as text.
schema_organization = {
    **base_schema_authority,
    "administrativeHistoryNote": "administrativeHistoryNote",
    "naId_jurisdictions": ["jurisdictions", "naId"],
    "name_jurisdictions": ["jurisdictions", "name"],
    "contributor_linkCounts": ["linkCounts", "contributor"],
    "creator_linkCounts": ["linkCounts", "creator"],
    "donor_linkCounts": ["linkCounts", "donor"],
    "subject_linkCounts": ["linkCounts", "subject"],
    "totalDescription_linkCounts": ["linkCounts", "totalDescription"],
    "contributorTypes_organizationNames": ["organizationNames", "contributorTypes"],
    "creatorTypes_organizationNames": ["organizationNames", "creatorTypes"],
    "heading_organizationNames": ["organizationNames", "heading"],
    "naId_organizationNames": ["organizationNames", "naId"],
    "name_organizationNames": ["organizationNames", "name"],
    "recordSource_organizationNames": ["organizationNames", "recordSource"],
    "variantOrganizationNames": "variantOrganizationNames",
    "authorityType_personalReferences": ["personalReferences", "authorityType"],
    "heading_personalReferences": ["personalReferences", "heading"],
    "naId_personalReferences": ["personalReferences", "naId"],
    "programAreas": "programAreas",
}

# Properties extracted for authority records of type "person".
# "biographicalNote" is the field get_authority_info() removes and chunks as
# text (its fallback branch for types that are neither scope-note based nor
# "organization").
schema_person = {
    **base_schema_authority,
    "biographicalNote": "biographicalNote",
    "logicalDate_birthDate": ["birthDate", "logicalDate"],
    "logicalDate_deathDate": ["deathDate", "logicalDate"],
    "fullerFormOfName": "fullerFormOfName",
    "importRecordControlNumber": "importRecordControlNumber",
    "contributor_linkCounts": ["linkCounts", "contributor"],
    "creator_linkCounts": ["linkCounts", "creator"],
    "donor_linkCounts": ["linkCounts", "donor"],
    "subject_linkCounts": ["linkCounts", "subject"],
    "totalDescription_linkCounts": ["linkCounts", "totalDescription"],
    "name": "name",
    "numerator": "numerator",
    "authorityType_organizationalReferences": [
        "organizationalReferences",
        "authorityType",
    ],
    "heading_organizationalReferences": ["organizationalReferences", "heading"],
    "naId_organizationalReferences": ["organizationalReferences", "naId"],
    "personalTitle": "personalTitle",
    "recordSource": "recordSource",
    "contributor_role": ["role", "contributor"],
    "creator_role": ["role", "creator"],
    "donor_role": ["role", "donor"],
    "reference_role": ["role", "reference"],
    "fullerFormOfName_variantPersonNames": ["variantPersonNames", "fullerFormOfName"],
    "heading_variantPersonNames": ["variantPersonNames", "heading"],
    "name_variantPersonNames": ["variantPersonNames", "name"],
    "numerator_variantPersonNames": ["variantPersonNames", "numerator"],
    "personalTitle_variantPersonNames": ["variantPersonNames", "personalTitle"],
}

# Properties extracted for authority records of type "specificRecordsType".
# "scopeNote" is the field get_authority_info() removes and chunks as text.
# NOTE(review): broader terms are read via "name" here but via "heading" (and
# "description") in schema_geographic_place_name — confirm against the data.
schema_specific_records_types = {
    **base_schema_authority,
    "naId_broaderTerms": ["broaderTerms", "naId"],
    "name_broaderTerms": ["broaderTerms", "name"],
    "importRecordControlNumber": "importRecordControlNumber",
    "specificRecordsType_linkCounts": ["linkCounts", "specificRecordsType"],
    "subject_linkCounts": ["linkCounts", "subject"],
    "totalDescription_linkCounts": ["linkCounts", "totalDescription"],
    "naId_narrowerTerms": ["narrowerTerms", "naId"],
    "heading_narrowerTerms": ["narrowerTerms", "heading"],
    "recordSource": "recordSource",
    "naId_relatedTerms": ["relatedTerms", "naId"],
    "heading_relatedTerms": ["relatedTerms", "heading"],
    "scopeNote": "scopeNote",
    "useFor": "useFor",
}

# Properties extracted for authority records of type "topicalSubject".
# "scopeNote" is the field get_authority_info() removes and chunks as text.
schema_topical_subject = {
    **base_schema_authority,
    "naId_broaderTerms": ["broaderTerms", "naId"],
    "name_broaderTerms": ["broaderTerms", "name"],
    "subject_linkCounts": ["linkCounts", "subject"],
    "topicalSubject_linkCounts": ["linkCounts", "topicalSubject"],
    "totalDescription_linkCounts": ["linkCounts", "totalDescription"],
    "naId_narrowerTerms": ["narrowerTerms", "naId"],
    "heading_narrowerTerms": ["narrowerTerms", "heading"],
    "naId_relatedTerms": ["relatedTerms", "naId"],
    "heading_relatedTerms": ["relatedTerms", "heading"],
    "recordSource": "recordSource",
    "scopeNote": "scopeNote",
    "useFor": "useFor",
}

# Lookup used by get_authority_info(): a record's authorityType value selects
# the (property name, lookup) pairs to extract. The values are dict.items()
# views; an authority type that is not listed raises KeyError at lookup time.
authorities_schema = {
    "geographicPlaceName": schema_geographic_place_name.items(),
    "organization": schema_organization.items(),
    "person": schema_person.items(),
    "specificRecordsType": schema_specific_records_types.items(),
    "topicalSubject": schema_topical_subject.items(),
}

# Process ZIP Archives
This section defines a helper that takes the next folder name off `remaining_folders.txt`, asks how many folders to process, initializes the result list, and then walks each ZIP archive, calling the appropriate info extraction function (description or authority) for every JSONL file in the selected folders.

In [None]:
def get_next_folder(filename="remaining_folders.txt"):
    """Pop and return the next folder name from the queue file.

    The file holds one folder name per line. The first non-blank line is
    returned (stripped of surrounding whitespace) and the file is rewritten
    with only the lines that follow it.

    Blank lines are skipped: returning an empty string would be accepted by
    callers that only check ``is not None``, and an empty prefix matches every
    entry when used with ``str.startswith``.

    Args:
        filename: Path of the queue file.

    Returns:
        The next folder name, or ``None`` when the file is missing, empty, or
        contains only blank lines (in which case the file is left untouched).
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except FileNotFoundError:
        return None

    # Find the first line that carries an actual folder name.
    for position, line in enumerate(lines):
        current_folder = line.strip()
        if current_folder:
            break
    else:
        return None

    # Persist the remainder so the next call continues after this entry.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(lines[position + 1 :])

    return current_folder


# Module-level state used by the cells below: `result` is iterated later when
# nodes are created, `folders` lists the folder names handled in this run.
result = []
folders = []

# Keep prompting until the user enters something that parses as an integer.
while True:
    answer = input("How many folders need to be processed?\n")
    try:
        n = int(answer)
    except ValueError:
        print("Error: Enter an integer")
        continue
    logger.info(f"{n} folder(s) will be processed")
    print(f"{n} folder(s) will be processed")
    break

# Pop up to `n` names from the queue file; an exhausted queue yields None,
# which is dropped.
folders.extend(
    name for name in (get_next_folder() for _ in range(n)) if name is not None
)

# Walk every archive and, inside it, every selected folder; each JSONL file is
# handed to the extractor matching the archive kind. The loop variable names
# are kept as-is in case the extraction helpers read them as globals -- verify
# before renaming.
for zip_path in zip_paths:
    logger.info("=" * 40)
    logger.info(f"Starting archive processing: {zip_path}")
    logger.info("=" * 40)

    # Authority archives are recognised by their path; everything else is
    # treated as a descriptions archive.
    extract_info = (
        get_authority_info if "authorities" in zip_path else get_description_info
    )

    with zipfile.ZipFile(zip_path, "r") as archive:
        for folder in tqdm(folders, desc="Folders"):
            logger.info("=" * 40)
            logger.info(f"Starting folder processing: {folder}")
            logger.info("=" * 40)

            files_in_folder = []
            for name in archive.namelist():
                if name.startswith(folder) and name.endswith("jsonl"):
                    files_in_folder.append(name)
            length = len(files_in_folder)

            progress = tqdm(files_in_folder, desc=f"{folder}", leave=False)
            for counter, file_name in enumerate(progress, 1):
                extract_info(file_name)
                logger.info(f"{folder}: {counter}/{length}")

# Define Merge Queries for Nodes
This section defines Cypher queries to merge (create or update) nodes in Neo4j for each type of description or authority, setting properties from the processed data.

In [None]:
# Property names written to each node type. The keys are used as node labels
# and each list is turned into the SET clauses of that label's MERGE query
# (see generate_merge_query / queries below). Every list starts with the
# file_name / line_num pair and ends with the chunk fields
# (text, chunkSeqId, chunkId).
# NOTE(review): the "specificRecordsTypes" key is plural while
# authorities_schema uses "specificRecordsType"; node creation looks the query
# up by node_data["authorityType"], so confirm the two spellings line up.
properties_by_type = {
    # --- description levels ---
    "item": [
        "file_name",
        "line_num",
        "ancestors",
        "accessionNumbers",
        "description_accessRestriction",
        "note_accessRestriction",
        "status_accessRestriction",
        "audiovisual",
        "beginCongress",
        "authorityType_contributors",
        "contributorType_contributors",
        "heading_contributors",
        "naId_contributors",
        "logicalDate_coverageEndDate",
        "logicalDate_coverageStartDate",
        "custodialHistoryNote",
        "groupName_dataControlGroup",
        "dateNote",
        "levelOfDescription",
        "objectDescription_digitalObjects",
        "objectType_digitalObjects",
        "objectUrl_digitalObjects",
        "endCongress",
        "generalNotes",
        "generalRecordsTypes",
        "internalTransferNumbers",
        "languages",
        "localIdentifier",
        "identifier_microformPublications",
        "note_microformPublications",
        "title_microformPublications",
        "naId",
        "description_onlineResources",
        "note_onlineResources",
        "url_onlineResources",
        "otherTitles",
        "partyDesignation",
        "logicalDate_productionDates",
        "recordsCenterTransferNumbers",
        "recordType",
        "scaleNote",
        "scopeAndContentNote",
        "authorityType_subjects",
        "heading_subjects",
        "naId_subjects",
        "subtitle",
        "title",
        "transferNote",
        "note_useRestriction",
        "specificUseRestrictions_useRestriction",
        "status_useRestriction",
        "note_variantControlNumbers",
        "number_variantControlNumbers",
        "type_variantControlNumbers",
        "copyStatus_physicalOccurrences",
        "extent_physicalOccurrences",
        "physicalOccurrenceNote_physicalOccurrences",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "fileUnit": [
        "file_name",
        "line_num",
        "ancestors",
        "accessionNumbers",
        "description_accessRestriction",
        "note_accessRestriction",
        "status_accessRestriction",
        "arrangement",
        "audiovisual",
        "beginCongress",
        "authorityType_contributors",
        "contributorType_contributors",
        "heading_contributors",
        "naId_contributors",
        "logicalDate_coverageEndDate",
        "logicalDate_coverageStartDate",
        "custodialHistoryNote",
        "groupName_dataControlGroup",
        "dateNote",
        "objectDescription_digitalObjects",
        "objectType_digitalObjects",
        "objectUrl_digitalObjects",
        "editStatus",
        "fileFormat_findingAids",
        "findingAidtype_findingAids",
        "note_findingAids",
        "source_findingAids",
        "url_findingAids",
        "urlNote_findingAids",
        "urlDescription_findingAids",
        "endCongress",
        "generalNotes",
        "generalRecordsTypes",
        "internalTransferNumbers",
        "itemCount",
        "languages",
        "levelOfDescription",
        "localIdentifier",
        "naId",
        "identifier_microformPublications",
        "note_microformPublications",
        "title_microformPublications",
        "description_onlineResources",
        "note_onlineResources",
        "url_onlineResources",
        "otherTitles",
        "partyDesignation",
        "copyStatus_physicalOccurrences",
        "extent_physicalOccurrences",
        "physicalOccurrenceNote_physicalOccurrences",
        "recordsCenterTransferNumbers",
        "recordType",
        "scaleNote",
        "scopeAndContentNote",
        "soundType",
        "authorityType_subjects",
        "heading_subjects",
        "naId_subjects",
        "subtitle",
        "title",
        "transferNote",
        "note_useRestriction",
        "specificUseRestrictions_useRestriction",
        "status_useRestriction",
        "note_variantControlNumbers",
        "number_variantControlNumbers",
        "type_variantControlNumbers",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "series": [
        "file_name",
        "line_num",
        "ancestors",
        "accessionNumbers",
        "description_accessRestriction",
        "note_accessRestriction",
        "status_accessRestriction",
        "arrangement",
        "audiovisual",
        "beginCongress",
        "authorityType_contributors",
        "contributorType_contributors",
        "heading_contributors",
        "naId_contributors",
        "logicalDate_coverageEndDate",
        "logicalDate_coverageStartDate",
        "authorityType_creators",
        "creatorType_creators",
        "heading_creators",
        "naId_creators",
        "custodialHistoryNote",
        "groupName_dataControlGroup",
        "dateNote",
        "dispositionAuthorityNumbers",
        "editStatus",
        "endCongress",
        "fileUnitCount",
        "fileFormat_findingAids",
        "findingAidtype_findingAids",
        "note_findingAids",
        "source_findingAids",
        "url_findingAids",
        "urlNote_findingAids",
        "functionAndUse",
        "generalNotes",
        "generalRecordsTypes",
        "logicalDate_inclusiveEndDate",
        "logicalDate_inclusiveStartDate",
        "internalTransferNumbers",
        "itemCount",
        "languages",
        "levelOfDescription",
        "localIdentifier",
        "identifier_microformPublications",
        "note_microformPublications",
        "title_microformPublications",
        "naId",
        "numberingNote",
        "note_onlineResources",
        "url_onlineResources",
        "otherTitles",
        "partyDesignation",
        "copyStatus_physicalOccurrences",
        "extent_physicalOccurrences",
        "physicalOccurrenceNote_physicalOccurrences",
        "recordsCenterTransferNumbers",
        "recordType",
        "scaleNote",
        "soundType",
        "authorityType_subjects",
        "heading_subjects",
        "naId_subjects",
        "title",
        "transferNote",
        "note_useRestriction",
        "specificUseRestrictions_useRestriction",
        "status_useRestriction",
        "note_variantControlNumbers",
        "number_variantControlNumbers",
        "type_variantControlNumbers",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "recordGroup": [
        "file_name",
        "line_num",
        "beginCongress",
        "logicalDate_coverageEndDate",
        "logicalDate_coverageStartDate",
        "groupName_dataControlGroup",
        "dateNote",
        "endCongress",
        "fileFormat_findingAids",
        "findingAidtype_findingAids",
        "note_findingAids",
        "source_findingAids",
        "url_findingAids",
        "urlNote_findingAids",
        "logicalDate_inclusiveEndDate",
        "logicalDate_inclusiveStartDate",
        "levelOfDescription",
        "naId",
        "partyDesignation",
        "recordGroupNumber",
        "recordType",
        "address1_referenceUnits",
        "address2_referenceUnits",
        "city_referenceUnits",
        "email_referenceUnits",
        "fax_referenceUnits",
        "mailCode_referenceUnits",
        "name_referenceUnits",
        "phone_referenceUnits",
        "postalCode_referenceUnits",
        "state_referenceUnits",
        "seriesCount",
        "title",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "collection": [
        "file_name",
        "line_num",
        "collectionIdentifier",
        "logicalDate_coverageEndDate",
        "logicalDate_coverageStartDate",
        "groupName_dataControlGroup",
        "dateNote",
        "authorityType_donors",
        "heading_donors",
        "naId_donors",
        "fileFormat_findingAids",
        "findingAidtype_findingAids",
        "note_findingAids",
        "source_findingAids",
        "url_findingAids",
        "urlNote_findingAids",
        "logicalDate_inclusiveEndDate",
        "logicalDate_inclusiveStartDate",
        "levelOfDescription",
        "naId",
        "recordType",
        "address1_referenceUnits",
        "address2_referenceUnits",
        "city_referenceUnits",
        "email_referenceUnits",
        "fax_referenceUnits",
        "mailCode_referenceUnits",
        "name_referenceUnits",
        "phone_referenceUnits",
        "postalCode_referenceUnits",
        "state_referenceUnits",
        "seriesCount",
        "title",
        "note_variantControlNumbers",
        "number_variantControlNumbers",
        "type_variantControlNumbers",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    # --- authority types ---
    "geographicPlaceName": [
        "file_name",
        "line_num",
        "authorityType",
        "description_broaderTerms",
        "naId_broaderTerms",
        "heading_broaderTerms",
        "coordinates",
        "heading",
        "importRecordControlNumber",
        "geographicPlaceName_linkCounts",
        "jurisdiction_linkCounts",
        "organization_linkCounts",
        "subject_linkCounts",
        "totalDescription_linkCounts",
        "naId",
        "naId_narrowerTerms",
        "heading_narrowerTerms",
        "naId_relatedTerms",
        "heading_relatedTerms",
        "recordSource",
        "recordType",
        "scopeNote",
        "sourceNotes",
        "useFor",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "organization": [
        "file_name",
        "line_num",
        "administrativeHistoryNote",
        "authorityType",
        "naId_jurisdictions",
        "name_jurisdictions",
        "heading",
        "contributor_linkCounts",
        "creator_linkCounts",
        "donor_linkCounts",
        "subject_linkCounts",
        "totalDescription_linkCounts",
        "naId",
        "contributorTypes_organizationNames",
        "creatorTypes_organizationNames",
        "heading_organizationNames",
        "naId_organizationNames",
        "name_organizationNames",
        "recordSource_organizationNames",
        "variantOrganizationNames",
        "authorityType_personalReferences",
        "heading_personalReferences",
        "naId_personalReferences",
        "programAreas",
        "recordType",
        "sourceNotes",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "person": [
        "file_name",
        "line_num",
        "authorityType",
        "biographicalNote",
        "logicalDate_birthDate",
        "logicalDate_deathDate",
        "fullerFormOfName",
        "heading",
        "importRecordControlNumber",
        "contributor_linkCounts",
        "creator_linkCounts",
        "donor_linkCounts",
        "subject_linkCounts",
        "totalDescription_linkCounts",
        "naId",
        "name",
        "numerator",
        "authorityType_organizationalReferences",
        "heading_organizationalReferences",
        "naId_organizationalReferences",
        "personalTitle",
        "recordSource",
        "recordType",
        "contributor_role",
        "creator_role",
        "donor_role",
        "reference_role",
        "sourceNotes",
        "fullerFormOfName_variantPersonNames",
        "heading_variantPersonNames",
        "name_variantPersonNames",
        "numerator_variantPersonNames",
        "personalTitle_variantPersonNames",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "specificRecordsTypes": [
        "file_name",
        "line_num",
        "authorityType",
        "naId_broaderTerms",
        "name_broaderTerms",
        "heading",
        "importRecordControlNumber",
        "specificRecordsType_linkCounts",
        "subject_linkCounts",
        "totalDescription_linkCounts",
        "naId",
        "naId_narrowerTerms",
        "heading_narrowerTerms",
        "recordType",
        "recordSource",
        "naId_relatedTerms",
        "heading_relatedTerms",
        "scopeNote",
        "sourceNotes",
        "useFor",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
    "topicalSubject": [
        "file_name",
        "line_num",
        "authorityType",
        "naId_broaderTerms",
        "name_broaderTerms",
        "heading",
        "subject_linkCounts",
        "topicalSubject_linkCounts",
        "totalDescription_linkCounts",
        "naId",
        "naId_narrowerTerms",
        "heading_narrowerTerms",
        "naId_relatedTerms",
        "heading_relatedTerms",
        "recordType",
        "recordSource",
        "scopeNote",
        "sourceNotes",
        "useFor",
        "text",
        "chunkSeqId",
        "chunkId",
    ],
}


def generate_merge_query(label, properties):
    """Build the Cypher MERGE statement for one node label.

    The node is matched (or created) on ``chunkId`` and every name in
    ``properties`` is then assigned from the ``$nodeParam`` map parameter.

    Args:
        label: Node label interpolated into the query.
        properties: Property names to copy from ``$nodeParam`` onto the node.

    Returns:
        The query text, ending with ``RETURN n``.
    """
    assignments = []
    for prop in properties:
        assignments.append(f"    n.{prop} = $nodeParam.{prop}")

    query_lines = [
        f"MERGE (n:{label} {{chunkId: $nodeParam.chunkId}})",
        "    SET",
        "    " + ",\n".join(assignments),
        "    RETURN n",
    ]
    return "\n".join(query_lines)


# One ready-to-run MERGE query per node label, keyed by that label.
queries = {
    node_label: generate_merge_query(node_label, node_properties)
    for node_label, node_properties in properties_by_type.items()
}

# Create Constraints
This section creates unique constraints in Neo4j for each node type to ensure chunkId properties are unique.

In [None]:
# Create one uniqueness constraint on chunkId per (constraint name, label)
# pair; IF NOT EXISTS makes re-running the cell harmless.
for constraint_name, node_label in constraints:
    kg.query(
        f"CREATE CONSTRAINT {constraint_name} IF NOT EXISTS\n"
        f"    FOR (n:{node_label}) REQUIRE n.chunkId IS UNIQUE"
    )

# Create Nodes in Neo4j
This section iterates over the processed results, creates nodes in Neo4j using the appropriate merge query, and logs success or errors.

In [None]:
# Write every extracted record to Neo4j with the MERGE query of its type.
# A failure on one record is logged and counted, then the loop moves on.
separator = "=" * 40
logger.info(separator)
logger.info("Starting node creation")
logger.info(separator)

success_count = 0
error_count = 0

for idx, node_data in enumerate(tqdm(result, desc="Nodes"), 1):
    try:
        logger.info(f"Creating node {idx}/{len(result)}")

        # Records carrying a levelOfDescription are keyed by it; the others
        # are keyed by their authorityType.
        node_type = node_data.get("levelOfDescription")
        if node_type is None:
            node_type = node_data["authorityType"]

        kg.query(queries[node_type], params={"nodeParam": node_data})
    except Exception as e:
        error_count += 1
        logger.error(f"Error processing node {idx}: {e}")
    else:
        success_count += 1

logger.info(separator)
logger.info(
    f"Node processing completed. Successful: {success_count}; Errors: {error_count}"
)
logger.info(separator)

# Set Source Property and Add recordWithText Label
This section sets a 'source' property on all nodes linking to their catalog URL based on naId and adds a 'recordWithText' label to nodes that have text for vector indexing.

In [None]:
# Give every node that has no `source` yet a catalog URL built from its naId.
# apoc.periodic.iterate applies the update in batches of 1000 nodes; this
# requires the APOC plugin on the server.
kg.query(
    """CALL apoc.periodic.iterate(
  "MATCH (n) WHERE n.source IS NULL RETURN n",
  "SET n.source = 'https://catalog.archives.gov/id/' + toString(n.naId)",
  {batchSize:1000})"""
)

# Add the vector-index label to every node whose text is not the 'N/A'
# placeholder. Nodes without a text property are not matched, because
# comparing null with <> never evaluates to true in Cypher.
kg.query(f"MATCH (n) WHERE n.text <> 'N/A' SET n:{VECTOR_NODE_LABEL}")

# Define Embedding Class and Embedding Generation Function
This section defines a custom embedding class using the loaded model and a function to generate and set text embeddings in batches for nodes without them.

In [None]:
class Embeddings:
    """Thin embedding wrapper around a tokenizer/model pair.

    Exposes ``embed_query`` and ``embed_documents``; both return plain Python
    lists of floats.
    """

    def __init__(self, model, tokenizer, device):
        """Store the model, its tokenizer and the device inputs are moved to."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def embed_query(self, text):
        """Return the embedding of a single text as a list of floats."""
        return self._embed([text])[0]

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, in input order."""
        return self._embed(texts)

    def _embed(self, texts):
        """Tokenize ``texts``, run the model and return L2-normalized vectors.

        Inputs are padded to the longest text in the batch and truncated to
        512 tokens. The vector of each text is the hidden state at the first
        token position, normalized to unit length.
        """
        texts = list(texts)
        if not texts:
            # Nothing to encode; avoid calling the tokenizer with no input.
            return []

        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            model_output = self.model(**encoded_input)

        embeddings = model_output.last_hidden_state[:, 0]
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        # Copy to host memory before converting: tensors on an accelerator
        # cannot be turned into Python/NumPy values directly, and the result
        # must not depend on a module-level `device` variable.
        return embeddings.cpu().tolist()


# Shared embedder instance used by generate_and_set_embeddings below; the
# model, tokenizer and device are created earlier in the notebook.
my_embeddings = Embeddings(embedder_model, embedder_tokenizer, device)


def generate_and_set_embeddings(label: str, batch_size: int = 32):
    """Compute and store text embeddings for nodes that do not have one yet.

    Nodes with the given label and no ``VECTOR_EMBEDDING_PROPERTY`` are
    collected first. Their texts are then fetched 1000 nodes at a time,
    embedded ``batch_size`` texts at a time, and written back with
    ``db.create.setNodeVectorProperty``. A failing model batch is logged and
    skipped so the remaining batches are still processed.

    Args:
        label: Node label to process; interpolated into the Cypher query.
        batch_size: Number of texts sent to the embedding model per call.
    """
    fetch_size = 1000  # nodes whose text is read from the database per round trip

    logger.info("=" * 40)
    logger.info("Starting embedding creation")
    logger.info("=" * 40)

    nodes = kg.query(
        f"MATCH (n:{label}) WHERE n.{VECTOR_EMBEDDING_PROPERTY} IS NULL RETURN elementId(n) AS node_id"
    )

    if not nodes:
        logger.info("=" * 40)
        logger.info(f"All nodes {label} already have embeddings")
        logger.info("=" * 40)
        print(f"All nodes {label} already have embeddings")
        return

    node_ids = [node["node_id"] for node in nodes]
    total_nodes = len(node_ids)
    logger.info(f"Found {total_nodes} {label} nodes without embeddings")
    print(f"Found {total_nodes} '{label}' nodes without embeddings")

    for i in range(0, total_nodes, fetch_size):
        batch_ids = node_ids[i : i + fetch_size]

        texts_result = kg.query(
            """UNWIND $batch_ids AS node_id
            MATCH (n) WHERE elementId(n) = node_id
            RETURN elementId(n) AS node_id, n.text AS text""",
            params={"batch_ids": batch_ids},
        )

        for j in range(0, len(texts_result), batch_size):
            model_batch = texts_result[j : j + batch_size]
            batch_texts = [item["text"] for item in model_batch]

            try:
                embeddings = my_embeddings.embed_documents(batch_texts)

                params_list = [
                    {"node_id": item["node_id"], "embedding": embedding}
                    for item, embedding in zip(model_batch, embeddings)
                ]

                # The target property is passed as a parameter so the write
                # always goes to the same property the selection query above
                # checks for NULL.
                kg.query(
                    """UNWIND $batch AS item
                    MATCH (n) WHERE elementId(n) = item.node_id
                    CALL db.create.setNodeVectorProperty(n, $embedding_property, item.embedding)""",
                    params={
                        "batch": params_list,
                        "embedding_property": VECTOR_EMBEDDING_PROPERTY,
                    },
                )

            except Exception as e:
                # Best effort: report the failing batch and keep going.
                logger.error(f"Error in batch {i + j}: {str(e)}")
                print(f"Error in batch {i + j}: {str(e)}")

        logger.info(f"Processed {min(i + fetch_size, total_nodes)}/{total_nodes} nodes")
        print(f"Processed {min(i + fetch_size, total_nodes)}/{total_nodes} nodes")

    logger.info("=" * 40)
    logger.info("Embedding update completed")
    logger.info("=" * 40)
    print("Embedding update completed")


# Embed every node carrying the vector-index label that has no embedding yet.
generate_and_set_embeddings(VECTOR_NODE_LABEL)

# Create Vector Index
This section creates a vector index in Neo4j for similarity searches on text embeddings.

In [None]:
# Create the cosine-similarity vector index over the embedding property.
# Index name, label and property come from the shared VECTOR_* constants so
# this statement cannot drift from the labelling and embedding cells.
# NOTE(review): 1024 must equal the embedding model's output size -- confirm
# whenever the embedder is changed.
kg.query(
    f"""CREATE VECTOR INDEX `{VECTOR_INDEX_NAME}` IF NOT EXISTS
    FOR (n:{VECTOR_NODE_LABEL}) ON (n.{VECTOR_EMBEDDING_PROPERTY})
    OPTIONS {{ indexConfig: {{
        `vector.dimensions`: 1024,
        `vector.similarity_function`: 'cosine'
    }}}}"""
)

# Create Next Relationships Between Chunks and Includes Relationships for Ancestors
This section creates 'Next' relationships between consecutive nodes with text chunks of the same document and 'Includes' relationships based on ancestor hierarchies for the first chunk nodes.

In [None]:
# Chain the chunks of each document: nodes are grouped by naId, ordered by
# chunkSeqId, and every node is linked to its successor with a Next
# relationship. A group with a single node produces an empty range and
# therefore no relationship.
# NOTE(review): MATCH (n) has no label, so this scans every node in the graph.
kg.query(
    """MATCH (n)
WITH n.naId AS docId, n
ORDER BY docId, n.chunkSeqId
WITH docId, collect(n) AS nodes
UNWIND range(0, size(nodes)-2) AS idx
WITH nodes[idx] AS current, nodes[idx+1] AS next
MERGE (current)-[:Next]->(next)"""
)

# Link hierarchy levels: the first chunk (chunkSeqId = 0) of the node whose
# naId equals the first entry of a.ancestors gets an Includes relationship to
# the first chunk of a.
# NOTE(review): assumes ancestors[0] is the direct parent -- confirm against
# the extraction code. The (a), (b) pattern is an unlabelled cartesian match
# and may be slow on a large graph.
kg.query(
    """MATCH (a), (b)
WHERE b.naId = a.ancestors[0]
AND a.chunkSeqId = 0
AND b.chunkSeqId = 0
MERGE (b)-[:Includes]->(a)"""
)

# Create Various Relationships
This section defines and executes queries that create relationships such as broaderTerm, contributor, creator, etc., between nodes based on the naId values stored in their properties.

In [None]:
# (property holding referenced naIds, relationship type to create) pairs.
# Each property name must match a name stored on the nodes (see
# properties_by_type); the jurisdiction entry uses the plural
# "naId_jurisdictions", which is the property organization nodes carry.
relations = [
    ("naId_broaderTerms", "broaderTerm"),
    ("naId_contributors", "contributor"),
    ("naId_creators", "creator"),
    ("naId_subjects", "subject"),
    ("naId_donors", "donor"),
    ("naId_narrowerTerms", "narrowerTerm"),
    ("naId_organizationalReferences", "organizationalReference"),
    ("naId_relatedTerms", "relatedTerm"),
    ("naId_jurisdictions", "jurisdiction"),
    ("naId_organizationNames", "organizationName"),
    ("naId_personalReferences", "personalReference"),
]

# Shared query text: for every first-chunk node `a` whose reference property
# is set (and is not the 'N/A' placeholder), find each referenced first-chunk
# node `b` by naId and link b -> a with the relationship type.
relation_query_template = """
    MATCH (a)
    WHERE a.{prop} IS NOT NULL AND a.{prop} <> 'N/A' AND a.chunkSeqId = 0
    UNWIND a.{prop} AS id
    MATCH (b)
    WHERE b.naId = id AND b.chunkSeqId = 0
    MERGE (b)-[:{rel}]->(a)
    """

for source_property, relationship_type in relations:
    kg.query(
        relation_query_template.format(prop=source_property, rel=relationship_type)
    )

# Log Processing Completion
This section logs the successfully processed folders to track the completion of the data processing pipeline.

In [None]:
# Record which folders this run covered, one per line; a failure while
# building or writing the message is logged with its traceback.
try:
    processed_folders = "\n".join(folders)
    logger.info("Folder(s) processed:\n" + processed_folders)
except Exception as e:
    logger.exception(f"Failed to process folder(s): {e}")