# task-utils for generating ERA5 initial/boundary conditions (ICBC) for Noah-MP runs


In [31]:
import pandas as pd
import yaml
import s3fs
import numpy as np
from jupiter.task_utils import TaskSet, TaskInventory, TaskInstructions, TaskLauncher
from jupiter.aws.s3 import upload_s3_file

In [41]:
# Task-set definitions. task-utils fills the ${...} placeholders from these
# values (and from the loop variables) when it builds each command.
params = dict(
    # Referenced as ${freq_want} by the --freq option of the command string
    freq_want='1H',
    # S3 prefix the outputs are saved under; still contains ${...} placeholders
    save_location='s3://jupiter-intern-projects/aarona/noah-mp-hue/${domain}/${start_date}_${end_date}/ICBC',
    # Referenced as ${domain} by the two paths in this dict
    domain='milwaukee',
    # Referenced as ${geogrid_file} by the --geogrid-file option
    geogrid_file='/home/jupiter/model/noahmp/geogrid-files/geo_em.d01.${domain}.nc',
)

# AWS Batch queue the jobs are submitted to (special queue for me)
batch_queue = 'csp-dev-wrf'
# Task-set name, also used for the YAML spec file name.
# (The Batch job definition used with it is called aarona-noah-mp-hue.)
taskset_name = 'milwaukee-2008-test'



## Find expected output files
Define the start and end dates that the task set loops over; each start/end pair corresponds to one run (and one set of output files) that we should expect.

In [42]:
# Parallel lists: the i-th start date belongs with the i-th end date.
start_dates, end_dates = ['2008-03-01'], ['2008-11-02']

# Expose both lists to the task set through its definitions
# (the 'loops' / 'mapped_loops' sections refer to them by key name).
params.update(start_dates=start_dates, end_dates=end_dates)
    

## Formatting task-utils YAML

This dictionary should include the key components of the task-utils YAML file. The parameter lists defined above (the start and end dates) are inserted programmatically via `params`.

In [43]:
# Command template run for each task; the ${...} placeholders are left
# verbatim here and are resolved later from the task-set definitions/loops.
cmd_str = (
    'python /home/jupiter/model/noahmp/generate-era5-boundary-conditions.py'
    ' --start-date ${start_date}'
    ' --end-date ${end_date}'
    ' --freq ${freq_want}'
    ' --save-location ${save_location}'
    ' --geogrid-file ${geogrid_file}'
)
print(cmd_str)



python /home/jupiter/model/noahmp/generate-era5-boundary-conditions.py --start-date ${start_date} --end-date ${end_date} --freq ${freq_want} --save-location ${save_location} --geogrid-file ${geogrid_file}


In [44]:
# Task-set specification; this dict is what gets serialised to the
# task-utils YAML spec file.
output_dict = {
    'name': taskset_name,
    'labels': {
        'env': 'dev',
        'project': 'csg',
        # Points to my personal S3 space
        'S3_ROOT': 's3://jupiter-intern-projects/aarona',
    },
    # Values available for ${...} substitution (paths, domain, date lists, ...)
    'definitions': params,
    'launch_settings': {
        'batch': {
            # Plain string literal: there is nothing to interpolate, so no f-prefix.
            'job_def': 'aarona-noah-mp-hue:1',
            'queue': batch_queue,
            # Per-job container resource overrides; values are strings
            'overrides': {
                'resourceRequirements': [
                    {'type': 'MEMORY', 'value': '16000'},  # MB
                    {'type': 'VCPU', 'value': '4'},
                ],
            },
        },
        'run_keys': {
            'command_string': cmd_str,
        },
    },
    # No completion indicator is configured for this task set. To add one, e.g.:
    #   'completed': {'components': ['${outpath}'], 'method': 's3_sensor'}
    'indicators': {},
    # start_date takes its values from the 'start_dates' definition
    'loops': {
        'start_date': 'start_dates',
    },
    # end_date is tied to start_date and drawn from the 'end_dates' definition
    # (presumably paired by position — confirm against the task-utils docs)
    'mapped_loops': {
        'end_date': {'start_date': 'end_dates'},
    },
}
    


In [45]:
# Write the task-set spec to <taskset_name>.yaml in the working directory.
# yaml.dump() returns None when it is given a stream to write to, so its
# return value is not bound to a name.
with open(f'{taskset_name}.yaml', 'w') as outfile:
    yaml.dump(output_dict, outfile)

## Run with task-utils

This follows the normal task-utils sequence.  Start by making an instruction list (which should also help confirm our YAML was formatted correctly).

In [46]:
# Notebook-wide safety switch: if True, no batch runs will actually be launched.
DRY_RUN = False

# Build the task set from the YAML spec written above (sync mode 'overwrite'),
# then expand it into the table of per-task instructions.
ts = TaskSet(taskset_name + '.yaml', sync='overwrite')
instr = TaskInstructions(ts)

2022-07-22 19:37:00,280 | DEBUG      | jupiter.task_utils.task_set.TaskSet:_create_s3_root_definition:286 | S3_ROOT is already defined: s3://jupiter-intern-projects/aarona
2022-07-22 19:37:00,346 | INFO       | jupiter.task_utils.task_set.TaskSet:_sync:393 | TaskSet does not yet exist on S3; saving to s3://jupiter-intern-projects/aarona/task_utils/spec_files/milwaukee-2008-test.yaml


In [47]:
# Size of the generated instruction table as (rows, columns)
instr.df.shape

(1, 5)

In [48]:
# Preview the first instruction rows to confirm the loop values and labels look right
instr.df.head()

Unnamed: 0,start_date,S3_ROOT,env,project,end_date
0,2008-03-01,s3://jupiter-intern-projects/aarona,dev,csg,2008-11-02


## Build the test set

In [49]:
# Basic testing mode: just flag the first `number_of_tests` runs as tests
test_mode = 'basic'
number_of_tests = 1

# Alternative: one test run for each value of a specified loop (or several loops)
#test_mode = "one_per_loop"
#loop_names = ["peril"]

# Alternative: define the test set with a custom query
#test_mode = "query"
#test_query = f"(tileid in {testset}) and (projection_scenario == 'ssp585') and (metric == 'windSpeed500yr')"#" and scenario == 'worst_one'"


###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################
load_existing_inventory_for_tests = True

# Flag the test subset in the inventory according to the chosen mode.
# Only the names needed by the selected mode have to be defined above.
task_inv = TaskInventory(ts, load_existing=load_existing_inventory_for_tests)
if test_mode == 'basic':
    task_inv.assign_basic_test_set(number_in_set=number_of_tests)
elif test_mode == 'one_per_loop':
    task_inv.assign_test_set_by_loop(loops=loop_names, combinations=True)
elif test_mode == 'query':
    task_inv.assign_test_set_by_query(query_str=test_query)
else:
    raise ValueError(f"test_mode {test_mode} not valid")
task_inv.save()

# Rebuild the instructions, keep only the flagged tests, and preview them.
task_instructions_test = TaskInstructions(ts)
task_instructions_test.filter_tests_only()
task_instructions_test.preview()
print(f'Notebook DRY_RUN value is set to {DRY_RUN}')

2022-07-22 19:37:05,562 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:assign_basic_test_set:641 | Assigned the first 1 entries as the test set
2022-07-22 19:37:05,656 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:save:387 | Saved inventory to s3://jupiter-intern-projects/aarona/task_utils/inventories/milwaukee-2008-test.csv
2022-07-22 19:37:05,688 | INFO       | jupiter.task_utils.task_instructions.TaskInstructions:filter_tests_only:218 | Subsetting instructions to only run tasks flagged as tests
2022-07-22 19:37:05,729 | DEBUG      | jupiter.task_utils.concurrent_s3_client:check_s3_object_exists:161 | Verified that object s3://jupiter-intern-projects/aarona/task_utils/inventories/milwaukee-2008-test.csv exists
2022-07-22 19:37:05,774 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:_initialize_df:156 | Loaded inventory from s3://jupiter-intern-projects/aarona/task_utils/inventories/milwaukee-2008-test.csv
2022-07-22 19:37:05,780 | INFO  

## Launch test jobs here

Please use job arrays to make this easier!

In [50]:
# Submit the test runs as individual Batch jobs rather than as a job array
use_job_arrays = False
job_array_split_criteria = None  # e.g. 'peril'

## Advanced options, please do not use rashly!
num_attempts = timeout = None

# See step 4 for details
jobs_per_execution = 1
group_criteria = None


###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

# Write the test instructions file and hand it to a launcher.
instructions_file_test = task_instructions_test.write_instructions()
tl_test = TaskLauncher(instructions_file_test)
if use_job_arrays:
    tl_test.launch_via_aws_job_array(
        dry_run=DRY_RUN,
        split_on=job_array_split_criteria,
        jobs_per_execution=jobs_per_execution,
        group_on=group_criteria,
        num_attempts=num_attempts,
        timeout=timeout,
    )
else:
    tl_test.launch_via_aws_batch(dry_run=DRY_RUN, num_attempts=num_attempts, timeout=timeout)

2022-07-22 19:37:11,156 | INFO       | jupiter.task_utils.task_instructions.TaskInstructions:write_instructions:353 | Wrote instruction_file to s3://jupiter-intern-projects/aarona/task_utils/instruction_files/instructions_milwaukee-2008-test_20220722_193711.yaml
2022-07-22 19:37:11,249 | INFO       | jupiter.task_utils.task_launcher.TaskLauncher:launch_via_aws_batch:256 | Launching 1 AWS Batch Jobs on queue csp-dev-wrf with job definition aarona-noah-mp-hue:1
2022-07-22 19:37:11,251 | INFO       | jupiter.task_utils.task_launcher.TaskLauncher:_submit_single_batch_job:789 | Submitting single job milwaukee-2008-test_2008-03-01 to queue csp-dev-wrf with job definition aarona-noah-mp-hue:1
2022-07-22 19:37:11,252 | DEBUG      | jupiter.task_utils.task_launcher.TaskLauncher:_submit_single_batch_job:792 | Submission: {'jobName': 'milwaukee-2008-test_2008-03-01', 'jobQueue': 'csp-dev-wrf', 'jobDefinition': 'aarona-noah-mp-hue:1', 'containerOverrides': ordereddict([('resourceRequirements', [or

## Check status of test runs

In [63]:
# No options here, just execute it!

###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

load_existing_inventory = True
skip_s3_datacheck = {'completed': 1}
also_update_batch_status = True
batch_skip_statuses = ['SUCCEEDED']
num_processors = None

# Reload the saved inventory, refresh the status of the test runs only,
# print the summaries, and persist the updated inventory.
task_inv = TaskInventory(ts, load_existing=load_existing_inventory)
task_inv.update_status(
    skip_values=skip_s3_datacheck,
    also_update_batch_status=also_update_batch_status,
    nproc=num_processors,
    batch_skip_statuses=batch_skip_statuses,
    tests_only=True,
)
task_inv.print_summary(tests_only=True)
if also_update_batch_status:
    task_inv.print_batch_summary(tests_only=True)
task_inv.save()
#task_inv.apply_style(task_inv.test_df)

2022-07-22 19:39:22,896 | DEBUG      | jupiter.task_utils.concurrent_s3_client:check_s3_object_exists:161 | Verified that object s3://jupiter-intern-projects/aarona/task_utils/inventories/milwaukee-2008-test.csv exists
2022-07-22 19:39:22,988 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:_initialize_df:156 | Loaded inventory from s3://jupiter-intern-projects/aarona/task_utils/inventories/milwaukee-2008-test.csv
2022-07-22 19:39:22,990 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:update_batch_status:455 | Updating Batch job status for milwaukee-2008-test
2022-07-22 19:39:23,001 | DEBUG      | jupiter.task_utils.task_inventory.TaskInventory:_query_batch_status:506 | Check batch status for jobs 0:100
2022-07-22 19:39:23,316 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:update_batch_status:457 | Batch job status update complete!
2022-07-22 19:39:23,319 | INFO       | jupiter.task_utils.task_inventory.TaskInventory:print_summary:414 | ----

In [61]:
# Show the test-set rows of the inventory with the inventory's styling applied
task_inv.apply_style(task_inv.test_df)

Unnamed: 0,start_date,end_date,LAST_KNOWN_JOB_STATUS,CLOUDWATCH_LOGS,JOB_STATUS_REASON,SPOT_TERMINATED,IN_TEST_SET,LAST_KNOWN_JOB_ID,LAST_JOB_CHECK,JOB_DURATION
0,2008-03-01,2008-11-02,RUNNABLE,--,UNKNOWN,0,✓,8ea59673-01c7-4de1-8781-9eb7d9eaae9e,2022-07-22T19:38:44,--


In [40]:
# CLOUDWATCH_LOGS entry of the first test run (a CloudWatch console URL once logs exist)
task_inv.test_df['CLOUDWATCH_LOGS'].iloc[0]

'https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/%252Faws%252Fbatch%252Fjob/log-events/aarona-noah-mp-hue%252Fdefault%252F9a167c2522104009a1133d532de65771'

## Rerun failures of test runs

In [None]:
# Select the failed test runs to resubmit.
# The query deliberately does not reference a `completed` column: this task
# set's 'indicators' section is empty, and the inventory table shown in this
# notebook has no such column, so a query on it would presumably fail.
# NOTE(review): if a 'completed' indicator is added to the spec, consider
# appending ' and completed != 1' again.
# Broader alternative (anything not currently queued or running):
#filter_query = 'LAST_KNOWN_JOB_STATUS not in ["SUBMITTED","PENDING","STARTING","RUNNABLE","RUNNING"] and IN_TEST_SET == True'
filter_query = 'LAST_KNOWN_JOB_STATUS == "FAILED" and IN_TEST_SET == True'

update_first = True
skip_s3_datacheck = {'completed': 1}
also_update_batch_status = True
batch_skip_statuses = ['SUCCEEDED']
num_processors = None

###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

# Rebuild the instructions and keep only the rows matching the query
# (the inventory is refreshed first because update_first is True).
rerun_instructions = TaskInstructions(ts)
rerun_instructions.filter_on_inventory(
    query=filter_query,
    update_first=update_first,
    skip_values=skip_s3_datacheck,
    also_update_batch_status=also_update_batch_status,
    nproc=num_processors,
    batch_skip_statuses=batch_skip_statuses,
)
#rerun_instructions.filter_tests_only()
rerun_instructions.preview()
print(f'Notebook DRY_RUN value is set to {DRY_RUN}')

In [None]:
# Write the rerun instructions and resubmit them.
rerun_instructions_file_test = rerun_instructions.write_instructions()
tl_test = TaskLauncher(rerun_instructions_file_test)

# Force individual job submission for this rerun; the flag is set back to
# True once the launch call has returned.
use_job_arrays = False
if use_job_arrays:
    tl_test.launch_via_aws_job_array(
        dry_run=DRY_RUN,
        split_on=job_array_split_criteria,
        jobs_per_execution=jobs_per_execution,
        group_on=group_criteria,
        num_attempts=num_attempts,
        timeout=timeout,
    )
else:
    tl_test.launch_via_aws_batch(dry_run=DRY_RUN, num_attempts=num_attempts, timeout=timeout)
use_job_arrays = True
# Go back up and re-check the status

# FOR RUNNING ALL TASKS
## Create instructions for all remaining runs

In [None]:
## Build the instructions for the full run and confirm the DRY_RUN setting
task_instructions = TaskInstructions(ts)
# Drop the runs already flagged as tests; comment the next line out to run
# EVERYTHING, including previous tests.
task_instructions.filter_tests_excluded()
#task_instructions.preview()
print(f'Notebook DRY_RUN value is set to {DRY_RUN}')

## Launch all runs
Please use job arrays!

In [None]:
use_job_arrays = True
job_array_split_criteria = None  # example only, adjust for your use case

## Advanced options, please do not use rashly!
num_attempts = timeout = None

# Set one of these options to run multiple commands in a loop
# within a single container execution. Your container may need
# special code to handle this properly!
jobs_per_execution = 5
group_criteria = None


###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

# Write the instructions file for the remaining runs and submit it.
instructions_file = task_instructions.write_instructions()
tl = TaskLauncher(instructions_file)

if use_job_arrays:
    tl.launch_via_aws_job_array(
        dry_run=DRY_RUN,
        split_on=job_array_split_criteria,
        jobs_per_execution=jobs_per_execution,
        group_on=group_criteria,
        num_attempts=num_attempts,
        timeout=timeout,
    )
else:
    tl.launch_via_aws_batch(dry_run=DRY_RUN, num_attempts=num_attempts, timeout=timeout)

In [None]:
load_existing_inventory = True
skip_s3_datacheck = {'completed': 1}
also_update_batch_status = True
batch_skip_statuses = ['SUCCEEDED']
num_processors = None

###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

# Reload the saved inventory, refresh the status of all runs (not only the
# tests), print the summaries, and persist the updated inventory.
task_inv = TaskInventory(ts, load_existing=load_existing_inventory)
task_inv.update_status(
    skip_values=skip_s3_datacheck,
    also_update_batch_status=also_update_batch_status,
    nproc=num_processors,
    batch_skip_statuses=batch_skip_statuses,
)
task_inv.print_summary()
if also_update_batch_status:
    task_inv.print_batch_summary()
task_inv.save()
#.styled_df

In [None]:
# Show only the inventory rows whose last known Batch status is FAILED
task_inv.apply_style(task_inv.df[task_inv.df['LAST_KNOWN_JOB_STATUS']=='FAILED'])

In [None]:
# Distinct start dates of the runs whose last known Batch status is FAILED.
# This task set loops over start_date/end_date only; it has no 'tileid'
# column, so the lookup uses 'start_date' to identify the failed runs.
task_inv.df.loc[task_inv.df['LAST_KNOWN_JOB_STATUS'] == 'FAILED', 'start_date'].unique()

## Rerun failures

In [None]:
# Inventory query selecting the runs to resubmit; alternatives kept for reference:
#filter_query = 'succeeded != 1 and LAST_KNOWN_JOB_STATUS not in ["SUBMITTED","PENDING","STARTING","RUNNABLE","RUNNING"]'
#filter_query = '(completed != 1)'
filter_query = 'LAST_KNOWN_JOB_STATUS in ["FAILED"]'

# The inventory is not refreshed here (update_first / also_update_batch_status
# are False), so run the status-check cell above first.
update_first = False
skip_s3_datacheck = {'completed': 1}
also_update_batch_status = False
batch_skip_statuses = ['SUCCEEDED']
num_processors = None

###################  ↑↑   OPTIONS   ↑↑  ###################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
###################  ↓↓ DON'T TOUCH ↓↓  ###################

# Rebuild the instructions, keep only the rows matching the query, and preview.
rerun_instructions = TaskInstructions(ts)
rerun_instructions.filter_on_inventory(
    query=filter_query,
    update_first=update_first,
    skip_values=skip_s3_datacheck,
    also_update_batch_status=also_update_batch_status,
    nproc=num_processors,
    batch_skip_statuses=batch_skip_statuses,
)
rerun_instructions.preview()
print(f'Notebook DRY_RUN value is set to {DRY_RUN}')

In [None]:
# Launch options for the rerun submission in the next cell
use_job_arrays = True
job_array_split_criteria = None  # example only, adjust for your use case

## Advanced options, please do not use rashly!
num_attempts = timeout = None

# Set one of these options to run multiple commands in a loop
# within a single container execution. Your container may need
# special code to handle this properly!
jobs_per_execution = 1
group_criteria = None

In [None]:
# Write the rerun instructions file and submit it with the options set above.
rerun_instructions_file = rerun_instructions.write_instructions()
tl_test = TaskLauncher(rerun_instructions_file)

if use_job_arrays:
    tl_test.launch_via_aws_job_array(
        dry_run=DRY_RUN,
        split_on=job_array_split_criteria,
        jobs_per_execution=jobs_per_execution,
        group_on=group_criteria,
        num_attempts=num_attempts,
        timeout=timeout,
    )
else:
    tl_test.launch_via_aws_batch(dry_run=DRY_RUN, num_attempts=num_attempts, timeout=timeout)

# Go back up and re-check the status