### DPS Coordinator

This notebook interacts with the MAAP API. It submits and runs all DPS jobs in the same order they run inside `fireatlas.FireRunDaskCoordinator.py`. Each step runs separately and blocks for successful output.

The `poll_on_job_status` and `wait_for_job` functions let us block on, and collect the DPS job status of, multiple jobs before we continue

In [None]:
#!pip install -e ..

In [55]:
import functools
import json
import argparse
import os
import time
from typing import Tuple
import concurrent
from concurrent.futures import ThreadPoolExecutor

from fireatlas import preprocess
from fireatlas import FireTime
from fireatlas.FireLog import logger
from fireatlas.FireTypes import Region, TimeStep
from fireatlas.utils import timed
from fireatlas import FireConsts
from fireatlas import FireIO

from maap.maap import MAAP
from maap.dps.dps_job import DPSJob
from maap.utils import algorithm_utils


class JobSubmissionException(Exception):
    """Raised when one or more DPS jobs report a failed submission."""


def get_algorithm_config_filepath(dir_names, base_dir=None):
    """build the path to 'algorithm_config.yaml' for each named directory

    the base directory used to be derived from `os.path.abspath(__name__)`;
    `__name__` is a module name rather than a file path, so that expression
    only ever resolved to the current working directory. that is now stated
    explicitly (a notebook has no `__file__` to anchor on)

    :param dir_names: iterable of directory names located under the base directory
    :param base_dir: directory that contains the algorithm directories,
        defaults to the current working directory
    :return: list of '<base_dir>/<dir_name>/algorithm_config.yaml' paths in
        the same order as `dir_names`
    """
    if base_dir is None:
        base_dir = os.getcwd()
    return [
        os.path.join(base_dir, str(dir_name), 'algorithm_config.yaml')
        for dir_name in dir_names
    ]


def validate_job_submission(submitted_jobs: Tuple[DPSJob]) -> Tuple[DPSJob]:
    """we don't retry job submissions, they should ideally always work

    validate the status of each job submission result and hand the jobs back
    unchanged when none of them failed

    the status comparison is case-insensitive, in line with how
    `poll_on_job_status` lower-cases a status before comparing it

    :param submitted_jobs: job submission results, each exposing a `status` attribute
    :return: the same `submitted_jobs` object that was passed in
    :raises JobSubmissionException: if any submission has a status of 'failed'
    """
    failed_submissions = [
        result for result in submitted_jobs
        if str(result.status).lower() == 'failed'
    ]
    # test the list itself: `any(...)` would look at the truthiness of each job
    # object instead of whether any failure was collected
    if failed_submissions:
        raise JobSubmissionException(f"[ SUBMISSION FAILED ]: the following jobs failed to submit {failed_submissions}")
    return submitted_jobs


def wait_for_job(dps_job: DPSJob) -> DPSJob:
    """block on a single DPS job and hand back whatever `wait_for_completion` returns

    the waiting (with exponential backoff) happens inside this maap-py method:
    https://github.com/MAAP-Project/maap-py/blob/master/maap/dps/dps_job.py#L80C9-L80C28

    the lower-cased job statuses appear to be: ['failed', 'succeeded', 'accepted', 'running']
    https://github.com/MAAP-Project/maap-py/blob/master/maap/dps/dps_job.py
    """
    completed_job = dps_job.wait_for_completion()
    return completed_job


def poll_on_job_status(jobs: Tuple[DPSJob]) -> Tuple[DPSJob]:
    """wait on every job concurrently and collect the ones that did not succeed

    each job is handed to `wait_for_job` on a worker thread; as each wait
    finishes, the job's status is retrieved and compared (lower-cased) with
    'succeeded'

    NOTE: despite the annotation, the returned list holds the
    `concurrent.futures.Future` objects rather than the jobs themselves, call
    `.result()` on an entry to get at the job

    a job whose wait or status lookup raises is logged and is also reported as
    failed; it used to be dropped, which made it look like a success to the
    caller. if the error came from the wait itself, `.result()` on that entry
    raises it again

    :param jobs: the submitted jobs to wait on
    :return: list of futures for the jobs that did not finish with status 'succeeded'
    """
    failed_jobs = []
    # don't want to overwhelm the MAAP api so keeping max_workers relatively small
    with ThreadPoolExecutor(max_workers=5) as executor:
        dps_job_futures = [executor.submit(wait_for_job, dps_job) for dps_job in jobs]
        for job_future in concurrent.futures.as_completed(dps_job_futures):
            try:
                succeeded = job_future.result().retrieve_status().lower() == 'succeeded'
            except Exception as e:
                logger.exception(f"'poll_on_jobs_status' failed with {e}")
                # status is unknown: report the job instead of letting it pass as a success
                succeeded = False
            if not succeeded:
                failed_jobs.append(job_future)
    return failed_jobs


def track_submitted_jobs(submitted_jobs: Tuple[DPSJob]) -> Tuple[DPSJob]:
    """validate the submissions, then block on the jobs and return whatever
    `poll_on_job_status` reports as failed
    """
    return poll_on_job_status(validate_job_submission(submitted_jobs))

In [56]:
# start/end of the time range and the region used by the cells below
# NOTE(review): tst/ted look like [year, month, day, 'AM'|'PM'] — confirm against FireTime
tst = [2023,1,1,'AM']
ted = [2023,3,1,'PM']
# region[0] is sent to the jobs as 'regnm' and region[1] as 'bbox'
# NOTE(review): bbox ordering is presumably [west, south, east, north] — confirm
region = ["ShastaTrinity", [-124.354248,40.624376,-122.250366,42.045213]]
# materialize every time step that FireTime.t_generator yields for tst..ted
list_of_time_steps = list(FireTime.t_generator(tst, ted))

#### Data Update Checker

In [58]:
# read the algorithm config found under the 'coordinator' directory
configs = get_algorithm_config_filepath(['coordinator',])
maap_api = MAAP(maap_host='api.maap-project.org')
algo_config = algorithm_utils.read_yaml_file(configs[0])
# the 'inputs' section is not used below; tolerate configs that don't have one
algo_config.pop('inputs', None)
print(algo_config)

submitted_jobs = []
submit_job_kwargs = {
    "identifier": f"job-{algo_config['algorithm_name']}:{algo_config['algorithm_version']}",
    "algo_id": algo_config["algorithm_name"],
    "version": algo_config["algorithm_version"],
    # NOTE(review): hard-coded username; consider moving it to shared configuration
    "username": "gcorradini",
    "queue": algo_config["queue"],
}

# submit a single job for the '--data-update' operation
param_kwargs = {
    "regnm": region[0],
    "tst": json.dumps(tst),
    "bbox": region[1],
    "ted": json.dumps(ted),
    "operation": "--data-update",
}
result = maap_api.submitJob(**submit_job_kwargs, **param_kwargs)
submitted_jobs.append(result)
# raises JobSubmissionException when the submission reports a failure
queued_jobs = validate_job_submission(submitted_jobs)

{'algorithm_name': 'eis-feds-dask-coordinator-v3', 'algorithm_description': 'coordinator for all regional jobs, preprocess and FireForward steps', 'algorithm_version': '1.0.0', 'environment': 'ubuntu', 'repository_url': 'https://repo.maap-project.org/gcorradini/fireatlas_nrt.git', 'docker_container_url': 'mas.maap-project.org/root/maap-workspaces/base_images/vanilla:v3.1.4', 'queue': 'maap-dps-eis-worker-64gb', 'run_command': 'fireatlas_nrt/maap_runtime/run_dps_cli.sh', 'build_command': 'fireatlas_nrt/maap_runtime/run_dps_build.sh', 'disk_space': '100GB'}


#### Job Monitoring

In [59]:
# ask the MAAP API for the status of each queued job
[maap_api.getJobStatus(queued_job.id) for queued_job in queued_jobs]

['Accepted']

In [60]:
%%time
# block until the queued jobs finish; collects the ones that did not report 'succeeded'
failed_jobs = poll_on_job_status(queued_jobs)

INFO:backoff:Backing off wait_for_completion(...) for 0.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.8s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 5.5s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 2.0s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 17.9s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 18.0s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 7.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.6s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 44.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 39.4s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 14.4s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 34.5s (RuntimeError)
INFO:backoff:Backing off wait_fo

CPU times: user 540 ms, sys: 176 ms, total: 715 ms
Wall time: 8min 41s


### Preprocess Region

In [61]:
# read the algorithm config found under the 'coordinator' directory
configs = get_algorithm_config_filepath(['coordinator',])
maap_api = MAAP(maap_host='api.maap-project.org')
algo_config = algorithm_utils.read_yaml_file(configs[0])
# the 'inputs' section is not used below; tolerate configs that don't have one
algo_config.pop('inputs', None)
print(algo_config)

submitted_jobs = []
submit_job_kwargs = {
    "identifier": f"job-{algo_config['algorithm_name']}:{algo_config['algorithm_version']}",
    "algo_id": algo_config["algorithm_name"],
    "version": algo_config["algorithm_version"],
    # NOTE(review): hard-coded username; consider moving it to shared configuration
    "username": "gcorradini",
    "queue": algo_config["queue"],
}

# submit a single job for the '--preprocess-region' operation
param_kwargs = {
    "regnm": region[0],
    "tst": json.dumps(tst),
    "bbox": region[1],
    "ted": json.dumps(ted),
    "operation": "--preprocess-region",
}
result = maap_api.submitJob(**submit_job_kwargs, **param_kwargs)
submitted_jobs.append(result)
# raises JobSubmissionException when the submission reports a failure
queued_jobs = validate_job_submission(submitted_jobs)

{'algorithm_name': 'eis-feds-dask-coordinator-v3', 'algorithm_description': 'coordinator for all regional jobs, preprocess and FireForward steps', 'algorithm_version': '1.0.0', 'environment': 'ubuntu', 'repository_url': 'https://repo.maap-project.org/gcorradini/fireatlas_nrt.git', 'docker_container_url': 'mas.maap-project.org/root/maap-workspaces/base_images/vanilla:v3.1.4', 'queue': 'maap-dps-eis-worker-64gb', 'run_command': 'fireatlas_nrt/maap_runtime/run_dps_cli.sh', 'build_command': 'fireatlas_nrt/maap_runtime/run_dps_build.sh', 'disk_space': '100GB'}


#### Job Monitoring

In [62]:
# ask the MAAP API for the status of each queued job
[maap_api.getJobStatus(queued_job.id) for queued_job in queued_jobs]

['Running']

In [63]:
%%time
# block until the queued jobs finish; collects the ones that did not report 'succeeded'
failed_jobs = poll_on_job_status(queued_jobs)

INFO:backoff:Backing off wait_for_completion(...) for 0.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 1.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.9s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 20.8s (RuntimeError)


CPU times: user 174 ms, sys: 41.7 ms, total: 216 ms
Wall time: 24.2 s


### Preprocess Region T

In [None]:
# read the algorithm config found under the 'operation' directory
configs = get_algorithm_config_filepath(['operation',])
maap_api = MAAP(maap_host='api.maap-project.org')
algo_config = algorithm_utils.read_yaml_file(configs[0])
# the 'inputs' section is not used below; tolerate configs that don't have one
algo_config.pop('inputs', None)
print(algo_config)

submitted_jobs = []
submit_job_kwargs = {
    "identifier": f"job-{algo_config['algorithm_name']}:{algo_config['algorithm_version']}",
    "algo_id": algo_config["algorithm_name"],
    "version": algo_config["algorithm_version"],
    # NOTE(review): hard-coded username; consider moving it to shared configuration
    "username": "gcorradini",
    "queue": algo_config["queue"],
}

# submit one '--preprocess-region-t' job per time step whose output is missing
for t in list_of_time_steps:
    # check for existing output first so nothing is built for skipped time steps
    output_filepath = preprocess.preprocessed_filename(t, sat=FireConsts.firesrc, region=region, location="s3")
    if FireIO.os_path_exists(output_filepath):
        # adjacent literals keep the source indentation out of the logged message
        logger.info(
            "skipping 'preprocess_region_t' b/c file "
            f"already exists for region {region[0]}, {output_filepath}"
        )
        continue

    # NOTE(review): 'tst' is the current time step while 'ted' stays the overall end — confirm intended
    param_kwargs = {
        "regnm": region[0],
        "tst": json.dumps(t),
        "bbox": region[1],
        "ted": json.dumps(ted),
        "operation": "--preprocess-region-t",
    }
    result = maap_api.submitJob(**submit_job_kwargs, **param_kwargs)
    submitted_jobs.append(result)
# raises JobSubmissionException when any submission reports a failure
queued_jobs = validate_job_submission(submitted_jobs)

#### Job Monitoring

In [None]:
#[maap_api.getJobStatus(job.id) for job in queued_jobs]

In [None]:
%%time
# block until the queued jobs finish; collects the ones that did not report 'succeeded'
failed_jobs = poll_on_job_status(queued_jobs)

In [None]:
# how many entries the last polling call returned as not succeeded
len(failed_jobs)

#### FireForward

In [None]:
# read the algorithm config found under the 'coordinator' directory
configs = get_algorithm_config_filepath(['coordinator',])
maap_api = MAAP(maap_host='api.maap-project.org')
algo_config = algorithm_utils.read_yaml_file(configs[0])
# the 'inputs' section is not used below; tolerate configs that don't have one
algo_config.pop('inputs', None)
print(algo_config)

submitted_jobs = []
submit_job_kwargs = {
    "identifier": f"job-{algo_config['algorithm_name']}:{algo_config['algorithm_version']}",
    "algo_id": algo_config["algorithm_name"],
    "version": algo_config["algorithm_version"],
    # NOTE(review): hard-coded username; consider moving it to shared configuration
    "username": "gcorradini",
    "queue": algo_config["queue"],
}

# submit a single job for the '--fire-forward' operation
param_kwargs = {
    "regnm": region[0],
    "tst": json.dumps(tst),
    "bbox": region[1],
    "ted": json.dumps(ted),
    "operation": "--fire-forward",
}

result = maap_api.submitJob(**submit_job_kwargs, **param_kwargs)
submitted_jobs.append(result)
# raises JobSubmissionException when the submission reports a failure
queued_jobs = validate_job_submission(submitted_jobs)

#### Job Monitoring

In [None]:
# ask the MAAP API for the status of each queued job
[maap_api.getJobStatus(queued_job.id) for queued_job in queued_jobs]

In [None]:
%%time
# block until the queued jobs finish; collects the ones that did not report 'succeeded'
failed_jobs = poll_on_job_status(queued_jobs)

In [None]:
# how many entries the last polling call returned as not succeeded
len(failed_jobs)

In [None]:
#failed_jobs[0].result().retrieve_result()

#### Dask Coordinator

In [48]:
# time range and region for the Dask coordinator run; these overwrite the
# tst/ted/region values assigned earlier in the notebook
# NOTE(review): list_of_time_steps is not recomputed here — confirm nothing below needs it
tst = [2023,1,1,'AM']
ted = [2023,7,1,'PM']
# region[0] is sent to the job as 'regnm' and region[1] as 'bbox'
region = ["Oregon", [-124.925537,41.672912,-115.565186,46.513516]]

In [50]:
# read the algorithm config found under the 'coordinator' directory
configs = get_algorithm_config_filepath(['coordinator',])
maap_api = MAAP(maap_host='api.maap-project.org')
algo_config = algorithm_utils.read_yaml_file(configs[0])
# the 'inputs' section is not used below; tolerate configs that don't have one
algo_config.pop('inputs', None)
print(algo_config)

submitted_jobs = []
submit_job_kwargs = {
    "identifier": f"job-{algo_config['algorithm_name']}:{algo_config['algorithm_version']}",
    "algo_id": algo_config["algorithm_name"],
    "version": algo_config["algorithm_version"],
    # NOTE(review): hard-coded username; consider moving it to shared configuration
    "username": "gcorradini",
    "queue": algo_config["queue"],
}

# submit a single job for the '--coordinate-all' operation
param_kwargs = {
    "regnm": region[0],
    "tst": json.dumps(tst),
    "bbox": region[1],
    "ted": json.dumps(ted),
    "operation": "--coordinate-all",
}

result = maap_api.submitJob(**submit_job_kwargs, **param_kwargs)
submitted_jobs.append(result)
# raises JobSubmissionException when the submission reports a failure
queued_jobs = validate_job_submission(submitted_jobs)

{'algorithm_name': 'eis-feds-dask-coordinator-v3', 'algorithm_description': 'coordinator for all regional jobs, preprocess and FireForward steps', 'algorithm_version': '1.0.0', 'environment': 'ubuntu', 'repository_url': 'https://repo.maap-project.org/gcorradini/fireatlas_nrt.git', 'docker_container_url': 'mas.maap-project.org/root/maap-workspaces/base_images/vanilla:v3.1.4', 'queue': 'maap-dps-eis-worker-64gb', 'run_command': 'fireatlas_nrt/maap_runtime/run_dask_coordinator.sh', 'build_command': 'fireatlas_nrt/maap_runtime/run_dps_build.sh', 'disk_space': '100GB'}


In [51]:
%%time
# block until the queued jobs finish; collects the ones that did not report 'succeeded'
failed_jobs = poll_on_job_status(queued_jobs)

INFO:backoff:Backing off wait_for_completion(...) for 0.0s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 0.6s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 3.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 6.5s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 12.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 15.4s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 18.3s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 14.5s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 63.5s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 27.6s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 60.2s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 57.9s (RuntimeError)
INFO:backoff:Backing off wait_for_completion(...) for 44.4s (RuntimeError)
INFO:backoff:Backing off wait

CPU times: user 466 ms, sys: 32.2 ms, total: 499 ms
Wall time: 9min 36s


In [54]:
# inspect the first failed job's result; guarded so the cell does not raise
# IndexError when nothing failed
# NOTE: the returned result can embed the job's shell trace, including credentials
# such as the DPS machine token; clear this cell's output before sharing or committing
failed_jobs[0].result().retrieve_result() if failed_jobs else None

['http://maap-ops-workspace.s3-website-us-west-2.amazonaws.com/dataset/triaged_job/eis-feds-dask-coordinator-v3/1.0.0/2024/03/30/264f7a19-37ea-4d88-8af3-31d378346423',
 's3://s3-us-west-2.amazonaws.com:80/maap-ops-workspace/dataset/triaged_job/eis-feds-dask-coordinator-v3/1.0.0/2024/03/30/264f7a19-37ea-4d88-8af3-31d378346423',
 'https://s3.console.aws.amazon.com/s3/buckets/maap-ops-workspace/dataset/triaged_job/eis-feds-dask-coordinator-v3/1.0.0/2024/03/30/264f7a19-37ea-4d88-8af3-31d378346423/?region=us-east-1&tab=overview',
 '++ python /app/get_username.py\n+ USERNAME=gcorradini\n+ DPS_MACHINE_TOKEN=Y6hERQKLKs0oH0h5Kgi3d-gVVyHgW_DR77YeARdzd-a3KJDE-gYldJzhB94s635KySUXi4r9WOtD6Vo_F9cEbfNI_QUhOG8N47heXHVcW1k3vEw3d0StttXYMOnQPtTzSQLAYKQwJLgbtX-R8ug1XGVeMolh97P9nHcW8awN7yjyaR7g\n++ curl -sb -H \'Accept: application/json\' -H \'Content-Type: application/json\' -H \'dps-token: Y6hERQKLKs0oH0h5Kgi3d-gVVyHgW_DR77YeARdzd-a3KJDE-gYldJzhB94s635KySUXi4r9WOtD6Vo_F9cEbfNI_QUhOG8N47heXHVcW1k3vEw3d0St