# Notebook to Reconcile Collection Metadata

This notebook reconciles the collections in `/ingestion-data/collections`: for each collection it retrieves the `summaries` value from the API, merges it into the existing collection in `ghgc-data`, and posts the updated collection to the API.

In [15]:
import glob
import json
import requests

The following cell retrieves the collection JSON files from `/ingestion-data/collections/` and saves each file path, together with its collection ID, to a list.

In [16]:
# Every collection definition found in the ingestion-data folder.
json_file_paths = glob.glob("../ingestion-data/collections/*.json")

# Pair each file path with the collection id the file declares; files whose
# JSON has no "id" key are skipped.
file_paths_and_collection_ids = []
for file_path in json_file_paths:
    # The context manager closes each file as soon as it has been parsed,
    # instead of leaving one open handle per collection file behind.
    # utf-8 matches the encoding used when the same files are re-read later.
    with open(file_path, "r", encoding="utf-8") as collection_file:
        data = json.load(collection_file)
    if "id" in data:
        file_paths_and_collection_ids.append(
            {"filePath": file_path, "collectionId": data["id"]}
        )

Set the testing mode to `True` when testing and `False` otherwise. When the testing mode is `True`, the notebook will be set to run against `dev` endpoints.

In [17]:
# When True, the endpoint cell points `base_url` at https://dev.ghg.center;
# when False, `base_url` stays at https://earth.gov/ghgcenter.
testing_mode = True

Have your Cognito `username` and `password` ready: they are needed to retrieve the token that will be used to access the STAC Ingestor API. The notebook reads them from the `USERNAME` and `PASSWORD` environment variables.

In [18]:
# Root URL of the deployment this notebook talks to, selected by `testing_mode`.
# (Alternative test target: "https://staging.earth.gov/ghgcenter".)
base_url = (
    "https://dev.ghg.center" if testing_mode else "https://earth.gov/ghgcenter"
)

# The STAC API and the ingest API are both served under the same root.
stac_endpoint = base_url + "/api/stac"
ingest_endpoint = base_url + "/api/ingest"

The following cell sets up headers for requests.

In [19]:
import os

def get_header(username, password, timeout=30):
    """
    Creates the authentication header to be passed to API requests

    Posts the credentials as form data to the ingest API's `/token` endpoint
    (built from the module-level `base_url`) and wraps the returned
    `AccessToken` in a bearer `Authorization` header.

    Args:
        username: Username sent to the token endpoint.
        password: Password sent to the token endpoint.
        timeout: Seconds to wait for the token endpoint. Without a timeout,
            `requests` waits indefinitely on a stalled connection.

    Returns:
        dict: `{"Authorization": "Bearer <token>"}`.

    Raises:
        Exception: If the endpoint answers with an error status, or if the
            response body does not contain an `AccessToken`.
        requests.RequestException: If the request itself fails (for example
            a connection error or a timeout).
    """

    # Send the username and password to the /token endpoint to get the temporary token
    body = {
        "username": username,
        "password": password,
    }
    # request token
    response = requests.post(
        f"{base_url}/api/ingest/token", data=body, timeout=timeout
    )
    if not response.ok:
        raise Exception(
            "Couldn't obtain the token. Make sure the username and password are correct."
        )

    # get token from response
    token = response.json().get("AccessToken")
    if not token:
        # A missing token would otherwise be formatted into the header as the
        # literal text "Bearer None".
        raise Exception("The token response did not contain an AccessToken.")

    # prepare headers for requests
    return {"Authorization": f"Bearer {token}"}

# Credentials are taken from the USERNAME and PASSWORD environment variables;
# a variable that is not set is passed through as None.
headers = get_header(
    username=os.getenv("USERNAME"),
    password=os.getenv("PASSWORD"),
)

The following cell defines the functions that will be used to consolidate `summaries` and `links` to reconcile the collection metadata.

In [20]:
def post_reconciled_collection(collection, collection_id, timeout=60):
    """
    Posts a reconciled collection to the ingest API and prints the outcome

    The request goes to `{ingest_endpoint}/collections` and uses the
    module-level `headers` for authentication. Failures are reported with
    `print` rather than raised, so a loop over many collections keeps going
    after one of them fails.

    Args:
        collection: JSON-serializable collection body to post.
        collection_id: Id of the collection; used in the printed messages and
            in the STAC URL shown on success.
        timeout: Seconds to wait for the ingest API. Without a timeout,
            `requests` waits indefinitely on a stalled connection.

    Returns:
        None
    """
    collection_url = f"{stac_endpoint}/collections/{collection_id}"
    ingest_url = f"{ingest_endpoint}/collections"

    try:
        response = requests.post(
            ingest_url, json=collection, headers=headers, timeout=timeout
        )
        # 4xx / 5xx responses become requests.HTTPError, handled below.
        response.raise_for_status()
        if response.status_code == 201:
            print(
                f"Request was successful. Find the updated collection at {collection_url}"
            )
        else:
            # Reached for any non-error status other than 201.
            print(
                f"Updating {collection_id} failed. Request failed with status code: {response.status_code}"
            )
    except requests.RequestException as e:
        print(
            f"Updating {collection_id} failed. An error occurred during the request: {e}"
        )
    except Exception as e:
        print(
            f"An unexpected error occurred while trying to update {collection_id}: {e}"
        )


def merge_summaries(existing_summaries, retrieved_summaries):
    """
    Merges two `summaries` mappings, giving precedence to the existing one

    Keys already present in `existing_summaries` keep their value; keys that
    only appear in `retrieved_summaries` are appended after them. Neither
    input is modified (the result is a new, shallow-copied dict).

    Args:
        existing_summaries: Mapping whose values win on conflict. `None` or an
            empty mapping is treated as "no summaries".
        retrieved_summaries: Mapping that only fills in missing keys. `None`
            or an empty mapping is treated as "no summaries".

    Returns:
        dict: The merged summaries.
    """
    # `or {}` lets a missing / null value on either side behave like an empty
    # mapping instead of raising AttributeError.
    merged_summaries_dict = dict(existing_summaries or {})

    for key, value in (retrieved_summaries or {}).items():
        # setdefault only inserts when the key is absent, so existing wins.
        merged_summaries_dict.setdefault(key, value)

    return merged_summaries_dict


def retain_external_links(existing_links, retrieved_links):
    """
    Appends retrieved `external` links that the existing links do not have yet

    A retrieved link is added when its `rel` is `"external"` and its `href`
    does not match the `href` of any existing link. All existing links are
    kept, in their original order, ahead of the additions.

    Args:
        existing_links: Iterable of link dicts to keep. `None` or an empty
            container (including the `{}` a caller may use as a default) is
            treated as "no links".
        retrieved_links: Iterable of link dicts to pick `external` links
            from. `None` or an empty container is treated as "no links".

    Returns:
        list: A new list holding the existing links followed by the
        additional external links.
    """
    # Normalizing to a list keeps the concatenation below valid even when the
    # caller passes None or an empty dict for a missing `links` key.
    existing = list(existing_links or [])
    retrieved = retrieved_links or []

    unique_hrefs = {link.get("href") for link in existing}
    additional_external_links = [
        link
        for link in retrieved
        if link.get("rel") == "external" and link.get("href") not in unique_hrefs
    ]

    return existing + additional_external_links

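The following cell loops through `file_paths_and_collection_ids`, retrieves the `summaries` and `links` information for each existing collection from the API, merges it into the local collection, and writes the result to `temp/<collectionId>.json`. Uncomment the `post_reconciled_collection` call to publish the updated collection to the target ingestion `api/collections` endpoint.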

In [21]:
# Reconcile every local collection with its published counterpart and write
# the merged result to temp/<collectionId>.json.

# open(..., "w") does not create missing parent folders, so make sure the
# output folder exists before the loop starts.
os.makedirs("temp", exist_ok=True)

for item in file_paths_and_collection_ids:
    collection_id = item["collectionId"]
    file_path = item["filePath"]
    url = f"{stac_endpoint}/collections/{collection_id}"

    try:
        # The timeout keeps one unresponsive request from stalling the run.
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()
        json_response = response.json()

        # `or` also covers keys that are present but null. `links` is a list,
        # so its fallback has to be a list as well: a dict fallback cannot be
        # concatenated with the list of additional links.
        retrieved_summaries = json_response.get("summaries") or {}
        retrieved_links = json_response.get("links") or []

        with open(file_path, "r", encoding="utf-8") as file:
            collection = json.load(file)

        existing_summaries = collection.get("summaries") or {}
        existing_links = collection.get("links") or []

        collection["summaries"] = merge_summaries(
            existing_summaries, retrieved_summaries
        )
        collection["links"] = retain_external_links(existing_links, retrieved_links)

        print(collection_id)
        # Write the reconciled collection locally; the context manager
        # flushes and closes the file before the next iteration.
        with open(f"temp/{collection_id}.json", "w", encoding="utf-8") as output_file:
            json.dump(collection, output_file)

        # Publish the updated collection to the target ingestion `api/collections` endpoint
        # post_reconciled_collection(collection, collection_id)

    except requests.RequestException as e:
        print(f"An error occurred for collectionId {collection_id}: {e}")
    except Exception as e:
        # Deliberately broad: one bad collection must not abort the others.
        print(f"An unexpected error occurred for collectionId {collection_id}: {e}")

gosat-based-ch4budget-yeargrid-v1
lpjeosim-wetlandch4-daygrid-v2
epa-ch4emission-yeargrid-v2express
odiac-ffco2-monthgrid-v2022
eccodarwin-co2flux-monthgrid-v5
emit-ch4plume-v1
lpjeosim-wetlandch4-monthgrid-v2
micasa-carbonflux-monthgrid-v1
tm54dvar-ch4flux-mask-monthgrid-v1
oco2geos-co2-daygrid-v10r
oco2-mip-co2budget-yeargrid-v1
micasa-carbonflux-daygrid-v1
sedac-popdensity-yeargrid5yr-v4.11
