In [None]:
# Databricks notebook: download the artifacts of the latest run of a given task of THIS job
# ------------------------------------------------------------------------------------------

# PARAMETERS
# ----------
# All three parameters are free-text widgets whose default value is the empty string:
#   TASK_KEY    - task_key (task name) within this job
#   SOURCE_PATH - subfolder inside artifacts, empty for root
#   DEST_PATH   - local/DBFS dest on this cluster
for _widget_name in ("TASK_KEY", "SOURCE_PATH", "DEST_PATH"):
    dbutils.widgets.text(_widget_name, "")

In [None]:
# Read each widget once and trim surrounding whitespace from the value.
TASK_KEY, SOURCE_PATH, DEST_PATH = (
    dbutils.widgets.get(_name).strip()
    for _name in ("TASK_KEY", "SOURCE_PATH", "DEST_PATH")
)

In [None]:
# Fail fast on missing required parameters, before any CLI call is made.
# SOURCE_PATH is deliberately not checked: an empty value means "artifact root".
if not TASK_KEY:
    raise ValueError("TASK_KEY widget must be provided (task_key within the job).")
# An empty DEST_PATH would otherwise only surface later as `--dest ""` on the
# download command, after the run lookups have already been executed.
if not DEST_PATH:
    raise ValueError("DEST_PATH widget must be provided (destination directory for the artifacts).")

In [None]:
import json
import shlex
import shutil
import subprocess
from pathlib import Path

import dbruntime.databricks_repl_context as repl_ctx


In [None]:
def run_cmd(cmd: str) -> str:
    """Run a shell command line and return its captured stdout.

    The command is echoed before it is executed. When ``bash`` is found on the
    PATH the command runs under ``bash -o pipefail`` so that a failure in ANY
    stage of a pipeline (e.g. ``databricks ... | jq ...``) is reported; with a
    plain ``sh -c`` only the exit status of the last stage is visible, which
    hides a failing producer behind a succeeding consumer. If bash is not
    available the command is run through the default shell as before.

    Args:
        cmd: Shell command line; may contain pipes and shell quoting.

    Returns:
        The command's stdout decoded as text.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The message
            carries the exit code and the captured stdout and stderr.
    """
    print(f"Executing: {cmd}")

    bash = shutil.which("bash")
    if bash:
        # Explicit argv: bash receives the command line as a single -c argument.
        result = subprocess.run(
            [bash, "-o", "pipefail", "-c", cmd],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    else:
        # Fallback without pipefail: only the last pipeline stage is checked.
        result = subprocess.run(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

    if result.returncode != 0:
        raise RuntimeError(
            f"Command failed ({result.returncode}).\n"
            f"STDOUT:\n{result.stdout}\n\nSTDERR:\n{result.stderr}"
        )
    return result.stdout

In [None]:
def ensure_dir(path: str):
    """Create the directory ``path`` together with any missing parent directories.

    Nothing happens if the directory is already present.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)

In [None]:
# 1. Get JOB_ID of *this* notebook's job
# --------------------------------------
# NOTE(review): this assumes ctx.jobId() returns an Option-like object exposing
# isDefined()/get() - confirm against the Databricks runtime version in use.
ctx = repl_ctx.get_context()
if ctx.jobId().isDefined():
    JOB_ID = ctx.jobId().get()
else:
    # Interactive (non-job) execution: there is no job to look runs up for.
    raise RuntimeError("This notebook is not running inside a job; no jobId is available.")

print(f"Current job id (JOB_ID) = {JOB_ID}")

In [None]:
# 2. Ensure Jobs API 2.1 for CLI
# ------------------------------
# NOTE(review): `jobs configure --version` looks like a legacy-CLI subcommand, while
# `jobs list-runs` used further down looks like new-CLI syntax - confirm which
# Databricks CLI is installed on the cluster.
run_cmd(cmd="databricks jobs configure --version=2.1")

In [None]:
# 3. Find latest JOB_RUN_ID of this JOB_ID that has TASK_KEY
# ----------------------------------------------------------
# The jq program below is applied to the JSON printed by `databricks jobs list-runs`:
#   keep the runs whose .tasks contains TASK_KEY, sort by start_time desc,
#   and print the .run_id of the first one (jq prints "null" when none match).
#
# TASK_KEY is embedded via json.dumps so that it is always a correctly escaped
# jq string literal (quotes and backslashes in the value cannot alter the filter).
# `(.runs // [])` tolerates a response without a "runs" key, so that case yields
# "null" instead of a jq iteration error.

jq_filter_runs = rf'''
[
  (.runs // [])[]
  | select(any(.tasks[]?; .task_key == {json.dumps(TASK_KEY)}))
]
| sort_by(.start_time) | reverse
| .[0].run_id
'''


In [None]:
# Build the `list-runs | jq` pipeline.
# - shlex.quote is used for every interpolated value so the jq program reaches jq
#   intact even if it contains single quotes.
# - --expand-tasks: the jq filter inspects each run's .tasks; the Jobs API 2.1
#   runs/list response leaves task details out unless expansion is requested.
# NOTE(review): the Jobs API documents a small maximum page size for runs/list -
# confirm that --limit 50 is accepted by the installed CLI/API version.
cmd_list_runs = (
    f'databricks jobs list-runs --job-id {shlex.quote(str(JOB_ID))} '
    f'--expand-tasks --limit 50 --output JSON '
    f'| jq -r {shlex.quote(jq_filter_runs)}'
)

In [None]:
# Execute the list-runs pipeline; surrounding whitespace is stripped so the result
# can be compared directly against "" and "null" afterwards.
job_run_id_output = run_cmd(cmd_list_runs).strip()



In [None]:
# Empty output or the literal "null" both mean that no run matched the filter.
if job_run_id_output in ("", "null"):
    raise RuntimeError(
        f"No runs found for job_id={JOB_ID} that contain task_key='{TASK_KEY}'."
    )

In [None]:
# The validated pipeline output is the run id (kept as a string) of the newest matching job run.
JOB_RUN_ID = job_run_id_output
print(f"Latest JOB_RUN_ID={JOB_RUN_ID} for JOB_ID={JOB_ID}, TASK_KEY={TASK_KEY}")

In [None]:
# 4. Get TASK_RUN_ID for that JOB_RUN_ID and TASK_KEY
# ---------------------------------------------------
# Command:
#   databricks runs get --run-id JOB_RUN_ID
# Then jq:
#   .tasks[] | select(.task_key=="TASK_KEY") | .run_id

In [None]:
# jq program selecting the run_id of the task whose task_key equals TASK_KEY.
# json.dumps renders TASK_KEY as a correctly escaped jq string literal, so quotes
# or backslashes in the value cannot change the meaning of the filter.
jq_filter_task_run = f'.tasks[] | select(.task_key == {json.dumps(TASK_KEY)}) | .run_id'

# shlex.quote is used for every interpolated value so that each one reaches its
# program as exactly one argument, whatever characters it contains.
cmd_get_task_run_id = (
    f'databricks runs get --run-id {shlex.quote(str(JOB_RUN_ID))} '
    f'| jq -r {shlex.quote(jq_filter_task_run)}'
)

In [None]:

# Resolve the task-level run id; empty output or the literal "null" means the
# job run has no task with the requested key.
task_run_id_output = run_cmd(cmd_get_task_run_id).strip()
if task_run_id_output in ("", "null"):
    raise RuntimeError(
        f"No task with task_key='{TASK_KEY}' found in job run {JOB_RUN_ID}."
    )

TASK_RUN_ID = task_run_id_output
print(
    f"TASK_RUN_ID={TASK_RUN_ID} for JOB_ID={JOB_ID}, "
    f"JOB_RUN_ID={JOB_RUN_ID}, TASK_KEY={TASK_KEY}"
)

In [None]:
# 5. Download artifacts for that TASK_RUN_ID
# ------------------------------------------
ensure_dir(DEST_PATH)

# Assemble the command as an argument list and shell-quote each element.
# The paths come from notebook widgets: inside plain double quotes the shell
# would still interpret `$`, backticks and `"`, so the path seen by the CLI could
# differ from the literal path that ensure_dir() just created.
download_args = [
    "databricks", "runs", "download-artifacts",
    "--run-id", str(TASK_RUN_ID),
    "--dest", DEST_PATH,
]
if SOURCE_PATH:
    # Optional: restrict the download to a subfolder of the artifacts.
    download_args += ["--path", SOURCE_PATH]
base_cmd = " ".join(shlex.quote(arg) for arg in download_args)

print("Downloading artifacts:")
print(f"  job_id      = {JOB_ID}")
print(f"  job_run_id  = {JOB_RUN_ID}")
print(f"  task_run_id = {TASK_RUN_ID}")
print(f"  task_key    = {TASK_KEY}")
print(f"  source path = '{SOURCE_PATH or '/'}'")
print(f"  dest path   = '{DEST_PATH}'")

download_output = run_cmd(base_cmd)
print(download_output)

print("Artifact download complete.")