# CKAN API Documentation (Python + Jupyter)
## Introduction

This notebook documents how to interact with a CKAN instance using its HTTP API from Python.
It focuses on practical, copy/paste-friendly examples for common tasks:

- **Read-only (no auth required):** search datasets, read dataset/org/group metadata, list resources, etc.

- **Write operations (auth required):** create, update, and delete (CUD) datasets, resources, organizations, etc.

Each section will build up from basics (connection + GET requests) to authenticated workflows, with clear request/response examples.

## Requirements
Python libraries

You’ll need:

- requests (HTTP calls)

- pandas (optional, for tabular outputs)

- python-dotenv (optional, load environment variables from a .env file)

- tqdm (optional, progress bars for bulk operations)

## CKAN access & credentials

- CKAN base URL (e.g., https://your-ckan-site.example)

- API key (required for Create/Update/Delete endpoints)

Generate or copy your API key from your CKAN user profile (or ask an admin).

Keep it secret (e.g., in a local `.env` file) and never commit it to git.

Notes

Read-only endpoints usually work without an API key.

Authenticated requests typically pass the key via the Authorization header.

# Setup

In [1]:
# -----------------------------
# Imports
# -----------------------------
from __future__ import annotations

import json
import os
from typing import Any, Dict, Optional

from pprint import pprint

import requests
from dotenv import load_dotenv


# -----------------------------
# Configuration (env vars)
# -----------------------------
# Settings come from environment variables; a local .env file is loaded
# first when one is present. Example .env:
#   CKAN_BASE_URL=https://your-ckan-site.example
#   CKAN_API_KEY=xxxx-xxxx-xxxx-xxxx
load_dotenv()

# Trailing slashes are stripped so the action URL below never contains "//".
CKAN_BASE_URL = os.environ.get("CKAN_BASE_URL", "").rstrip("/")

# May be None: only Create/Update/Delete (CUD) actions need a key.
CKAN_API_KEY = os.environ.get("CKAN_API_KEY")

# Fail early: every request in this notebook is built on the base URL.
if CKAN_BASE_URL == "":
    raise ValueError("CKAN_BASE_URL is not set. Add it to your .env or environment variables.")

# Root of the Action API; individual action names are appended to this.
ACTION_BASE = CKAN_BASE_URL + "/api/3/action"

# Helpers

In [2]:
# -----------------------------
# Helper: call CKAN Action API
# -----------------------------
def ckan_action(
    action: str,
    *,
    method: str = "GET",
    data: Optional[Dict[str, Any]] = None,
    params: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
    timeout: int = 60,
) -> Dict[str, Any]:
    """
    Call a CKAN Action API endpoint and return the decoded JSON payload.

    Args:
      action: action name appended to ACTION_BASE (e.g. "package_search").
      method: "GET" or "POST" (case-insensitive).
      data: JSON body for POST requests; not used for GET.
      params: query-string parameters. For GET they are re-sent as the request
        body if the server answers that POST must be used.
      api_key: sent as-is in the Authorization header when non-empty.
      timeout: passed through to `requests` for every HTTP call made here.

    Returns:
      The full response payload as a dict (the action's own output is usually
      under the "result" key).

    Raises:
      ValueError: `method` is neither GET nor POST.
      RuntimeError: the response is not JSON, is not a JSON object, has an
        error HTTP status, or carries "success": false.
    """
    # Validate first so a bad call never builds a URL or touches the network.
    method = method.upper()
    if method not in ("GET", "POST"):
        raise ValueError("method must be GET or POST")

    url = f"{ACTION_BASE}/{action}"

    headers = {"Accept": "application/json"}
    if api_key:
        headers["Authorization"] = api_key
    json_headers = {**headers, "Content-Type": "application/json"}

    def _parse_or_raise(resp: requests.Response) -> Dict[str, Any]:
        """Decode the response body and raise RuntimeError on any failure signal."""
        try:
            payload = resp.json()
        except ValueError as err:
            # JSON decoding errors are ValueError subclasses; anything else is
            # a genuine bug and should not be disguised as a bad response.
            text = (resp.text or "").strip()
            raise RuntimeError(
                f"Non-JSON response from CKAN (HTTP {resp.status_code}): {text[:500]}"
            ) from err

        if not isinstance(payload, dict):
            # Valid JSON but not the expected object (e.g. a bare string or
            # list from a proxy); `.get` below would fail with AttributeError.
            raise RuntimeError(
                f"Unexpected JSON response from CKAN for action '{action}' "
                f"(HTTP {resp.status_code}): {json.dumps(payload)[:500]}"
            )

        if not resp.ok or payload.get("success") is False:
            raise RuntimeError(
                f"CKAN API error for action '{action}' (HTTP {resp.status_code}):\n"
                f"{json.dumps(payload, indent=2)}"
            )
        return payload

    def _looks_like_post_required(resp: requests.Response) -> bool:
        """Return True when a 400 response body asks the client to use POST."""
        if resp.status_code != 400:
            return False
        body = (resp.text or "").lower()
        return "invalid request" in body and "use post method" in body

    if method == "GET":
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)

        # Some CKAN/proxies require POST for Action API even for read-only actions
        if _looks_like_post_required(resp):
            # First retry: same parameters as a JSON body.
            resp = requests.post(
                url,
                headers=json_headers,
                json=params or {},
                timeout=timeout,
            )
            # Second retry: form-encoded body if the server still complains.
            if _looks_like_post_required(resp):
                resp = requests.post(
                    url,
                    headers=headers,
                    data=params or {},
                    timeout=timeout,
                )

        return _parse_or_raise(resp)

    # method == "POST"
    resp = requests.post(
        url,
        headers=json_headers,
        params=params,
        json=data or {},
        timeout=timeout,
    )

    # Retry form-encoded when the JSON body was rejected with the same hint.
    if _looks_like_post_required(resp):
        resp = requests.post(
            url,
            headers=headers,
            params=params,
            data=data or {},
            timeout=timeout,
        )

    return _parse_or_raise(resp)



def _truncate_str(s: str, max_len: int) -> str:
    if len(s) <= max_len:
        return s
    return s[: max_len - 3] + "..."

def _format_value(
    v: Any,
    *,
    max_str_len: int = 160,
    max_list_items: int = 6,
    max_dict_keys: int = 6,
    _depth: int = 0,
    _max_depth: int = 2,
) -> str:
    # Prevent overly-deep nesting in prints
    if _depth >= _max_depth:
        return '"..."'

    if isinstance(v, dict):
        items = list(v.items())
        shown = items[:max_dict_keys]
        parts = []
        for k, vv in shown:
            k_json = json.dumps(str(k))
            vv_json = _format_value(
                vv,
                max_str_len=max_str_len,
                max_list_items=max_list_items,
                max_dict_keys=max_dict_keys,
                _depth=_depth + 1,
                _max_depth=_max_depth,
            )
            parts.append(f"{k_json}: {vv_json}")
        if len(items) > max_dict_keys:
            parts.append('"...": "..."')
        return "{ " + ", ".join(parts) + " }"

    if isinstance(v, (list, tuple)):
        items = list(v)
        shown = items[:max_list_items]
        parts = [
            _format_value(
                vv,
                max_str_len=max_str_len,
                max_list_items=max_list_items,
                max_dict_keys=max_dict_keys,
                _depth=_depth + 1,
                _max_depth=_max_depth,
            )
            for vv in shown
        ]
        if len(items) > max_list_items:
            parts.append('"..."')
        return "[ " + ", ".join(parts) + " ]"

    # Scalars / everything else
    try:
        s = json.dumps(v, ensure_ascii=False)
    except TypeError:
        s = json.dumps(str(v), ensure_ascii=False)

    s = _truncate_str(s, max_str_len)
    return s


def print_key_vals(records, num_keys: int = 10, num_records: int = 5, *, show_record_index: bool = True):
    """
    Prints records in a JSON-like style.

    - Shows only the first `num_keys` keys per record; prints "..." if more keys exist.
    - Shows only the first `num_records` records; prints "..." if more records exist.
    - Values are abbreviated (nested dicts/lists, long strings) with "...".

    Args:
      records: a list of records, or a single record (anything that is not a
        list is treated as one record). Non-dict records are printed as a
        single abbreviated value.
      num_keys: maximum keys printed per dict record (negative is treated as 0).
      num_records: maximum records printed (negative is treated as 0).
      show_record_index: when True, print a "Record <i>:" line before each record.
    """
    if not isinstance(records, list):
        records = [records]

    # A negative limit would turn the slices below into "all but the last N"
    # and inflate the "more records" count, so clamp to zero.
    num_keys = max(num_keys, 0)
    num_records = max(num_records, 0)

    for i, rec in enumerate(records[:num_records]):
        if show_record_index:
            print(f"Record {i}:")
        if not isinstance(rec, dict):
            print(_format_value(rec))
            print()
            continue

        shown_keys = list(rec)[:num_keys]
        has_more_keys = len(rec) > num_keys
        last_idx = len(shown_keys) - 1

        print("{")
        for idx, k in enumerate(shown_keys):
            k_json = json.dumps(str(k), ensure_ascii=False)
            v_json = _format_value(rec[k])
            # The last shown entry still needs a comma when the "..." marker
            # line follows it; otherwise the listing reads as malformed.
            comma = "," if idx < last_idx or has_more_keys else ""
            print(f"  {k_json}: {v_json}{comma}")

        if has_more_keys:
            print('  "..." : "..."')

        print("}")
        print()

    hidden = len(records) - num_records
    if hidden > 0:
        print(f"... ({hidden} more records)")

            

# Read Records

In [3]:
def list_all_packages(
    *,
    page_size: int = 100,
    start: int = 0,
    include_private: bool = False,
    api_key: str | None = None,
) -> list[dict]:
    """
    List ALL packages (datasets) using pagination via `package_search`.

    Returns a list of dataset dicts (as returned by CKAN).

    Pagination:
      - Uses `package_search` with `rows` (page size) and `start` (offset).
      - Stops when a page comes back empty or the offset reaches the reported
        `count`.

    Auth:
      - Public datasets can be listed without an API key.
      - If include_private=True, an API key is required and must have permission
        to view private datasets. The key is only sent in that case, together
        with `include_private=True` in the search parameters.

    Raises:
      ValueError: invalid `page_size`/`start`, or include_private without a key.
      PermissionError: CKAN rejected a private-dataset request (auth failure).
      RuntimeError: any other CKAN error, or a non-numeric `count` in the reply.
    """
    # --- input validation ---
    if page_size < 1:
        raise ValueError("page_size must be >= 1")
    if start < 0:
        raise ValueError("start must be >= 0")
    if include_private and not api_key:
        raise ValueError("include_private=True requires a non-empty api_key")

    base_params: Dict[str, Any] = {"q": "*:*", "rows": page_size}
    if include_private:
        # Sending the key alone is not enough: package_search leaves private
        # datasets out unless the request explicitly asks for them.
        base_params["include_private"] = True
    auth_key = api_key if include_private else None

    all_results: list[dict] = []
    offset = start

    # --- pagination loop ---
    while True:
        try:
            payload = ckan_action(
                "package_search",
                method="GET",
                params={**base_params, "start": offset},
                api_key=auth_key,
            )
        except RuntimeError as e:
            # Provide a more actionable message for common auth failures
            msg = str(e)
            if include_private and ("403" in msg or "Not authorized" in msg or "authorization" in msg.lower()):
                raise PermissionError(
                    "CKAN rejected the request. Your api_key may be missing, invalid, "
                    "or lack permission to view private datasets."
                ) from e
            raise

        result = payload.get("result") or {}
        results = result.get("results") or []
        count_raw = result.get("count", 0)

        # Defensive parsing
        try:
            count = int(count_raw)
        except (TypeError, ValueError) as e:
            raise RuntimeError(f"Unexpected 'count' value from CKAN: {count_raw!r}") from e

        # Append and advance by what was actually returned, not by page_size,
        # in case the server caps `rows` below the requested size.
        all_results.extend(results)
        offset += len(results)

        # Stop when we've fetched everything, or when CKAN returns no more results
        if not results or offset >= count:
            break

    return all_results


In [4]:
# Fetch every dataset visible without an API key, then preview the first two.
records = list_all_packages()
print_key_vals(records, num_records=2)

Record 0:
{
  "author": "[{\"author_affiliation\": \"The University of Melbourne\", \"author_affiliation_identifier\": \"https://ror.org/01ej9dk98\", \"author_affiliation_identifier...,
  "author_email": null,
  "citation": "G. Beardsmore (CZO) Heat Needle Data. ",
  "creator_user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "credit": "",
  "declaration": "declaration_accepted",
  "deposit_date": "2025-06-20 04:10:39.055978",
  "doi": "10.82669/ncci8koo",
  "elevation": "",
  "embargo_date": ""
  "..." : "..."
}

Record 1:
{
  "author": "[{\"author_affiliation\": \"Commonwealth Scientific and Industrial Research Organisation\", \"author_affiliation_identifier\": \"https://ror.org/03qn8fb07\"...,
  "author_email": null,
  "citation": "Stu WoodmanTest ISO19115 001. ",
  "creator_user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "credit": "",
  "declaration": "declaration_accepted",
  "deposit_date": "2024-06-24 01:27:41.145582",
  "doi": "10.82669/cbqpep5i",
  "elevation": "",
  "emb

# Get Record by Title

In [5]:
def get_package_by_title(
    title: str,
    *,
    api_key: Optional[str] = None,
    include_private: bool = False,
    rows: int = 10,
) -> dict:
    """
    Find a package (dataset) by its *title* (human-readable), then return the best match.

    Strategy:
    - Uses `package_search` with a quoted title query to prefer exact phrase matches.
    - If multiple results match, prefers exact (case-insensitive) title equality.
    - Returns the dataset dict (as in `package_search` results).

    Args:
      title: title to look for; surrounding whitespace is ignored.
      api_key: only sent when include_private=True.
      include_private: also search private datasets (requires api_key).
      rows: how many candidates to fetch before picking the best match.

    Raises:
      ValueError: empty title, rows < 1, or include_private without a key.
      LookupError: the search returned no results.

    Notes:
    - Titles are not guaranteed unique in CKAN. If your instance allows duplicates,
      this will return the "best" match, not necessarily unique.
    - For guaranteed lookup, use the dataset `name`/`id` with `package_show`.
    """
    if not title or not title.strip():
        raise ValueError("title must be a non-empty string")
    if include_private and not api_key:
        raise ValueError("include_private=True requires a non-empty api_key")
    if rows < 1:
        raise ValueError("rows must be >= 1")

    wanted = title.strip()

    # Quoted phrase search. Backslashes must be escaped before quotes;
    # otherwise a title containing "\" would escape whatever follows it
    # (including our own closing quote) and corrupt the query.
    phrase = wanted.replace("\\", "\\\\").replace('"', '\\"')
    search_params: Dict[str, Any] = {"q": f'title:"{phrase}"', "rows": rows, "start": 0}
    if include_private:
        # The key alone does not widen the search; package_search only
        # returns private datasets when asked for them explicitly.
        search_params["include_private"] = True

    payload = ckan_action(
        "package_search",
        method="GET",
        params=search_params,
        api_key=api_key if include_private else None,
    )

    result = payload.get("result") or {}
    results: list[dict] = result.get("results") or []

    if not results:
        raise LookupError(f"No package found with title matching: {title!r}")

    # Prefer exact title match (case-insensitive); otherwise fall back to the
    # first search hit.
    title_norm = wanted.casefold()
    for candidate in results:
        if (candidate.get("title") or "").strip().casefold() == title_norm:
            return candidate
    return results[0]


In [7]:
# Look up one dataset by its human-readable title and preview its fields.
record = get_package_by_title(title="Test Instrument 02")
print_key_vals(record)

# Get API Token Names

In [5]:
def get_user_id(*, username: str, api_key: str) -> str:
    """
    Look up the CKAN user id for `username` through the `user_show` action.

    Both `username` and `api_key` must be non-blank; a ValueError is raised
    otherwise. A RuntimeError is raised when the response carries no id.
    """
    if not (username and username.strip()):
        raise ValueError("username is required")
    if not (api_key and api_key.strip()):
        raise ValueError("api_key is required")

    response = ckan_action(
        "user_show",
        method="GET",
        params={"id": username.strip()},
        api_key=api_key,
    )

    # A missing or empty "result" is treated the same as a missing id.
    resolved = (response.get("result") or {}).get("id")
    if resolved:
        return resolved
    raise RuntimeError("Could not resolve user id from user_show response.")

def list_api_tokens(*, api_key: str, username: str) -> list[dict]:
    """
    List the API tokens that belong to the user named `username`.

    The username is first resolved to a user id with `get_user_id`; that id is
    then sent to `api_token_list` as `user_id`.

    Args:
      api_key: required; sent in the Authorization header for both calls.
      username: required; CKAN username whose tokens are listed.

    Returns:
      The list of token dicts returned by CKAN.

    Raises:
      ValueError: `api_key` or `username` is empty/whitespace, or the resolved
        user id is blank.
      RuntimeError: the `api_token_list` result is not a list (errors raised by
        `get_user_id` / `ckan_action` propagate unchanged).
    """
    # Validate before the first network call, not after it.
    if not api_key or not api_key.strip():
        raise ValueError("api_key is required for api_token_list")
    if not username or not username.strip():
        raise ValueError("username is required")

    user_id = get_user_id(username=username, api_key=api_key)
    if not user_id or not user_id.strip():
        raise ValueError("user_id is required for api_token_list (your CKAN instance enforces this)")

    payload = ckan_action(
        "api_token_list",
        method="GET",
        params={"user_id": user_id.strip()},
        api_key=api_key,
    )

    result = payload.get("result")
    if not isinstance(result, list):
        raise RuntimeError(f"Unexpected api_token_list result type: {type(result).__name__}")
    return result

In [6]:
# List the API tokens of the `ckan_admin` user and preview up to five of them.
res = list_api_tokens(username= "ckan_admin", api_key=CKAN_API_KEY)
print_key_vals(res, num_keys=12, num_records=min(5, len(res)))

Record 0:
{
  "id": "CgrDv9EuiupEdjFuCN96thMtcvENRu4u7pdwnX3GrTY",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T00:43:54.909777",
  "last_access": null
}

Record 1:
{
  "id": "RQnRrSFj-1FsRXlEAtIN70mW0e1u7E2WyfFaOp5baRU",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-19T04:32:23.529843",
  "last_access": "2024-03-19T10:39:59.324042"
}

Record 2:
{
  "id": "yirdop13NfLEz1PP1dfI_u9DsdXn3O1ZvsSDr5PbxWs",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T02:43:28.140492",
  "last_access": "2024-03-06T03:04:02.927656"
}

Record 3:
{
  "id": "H_4MQFYxHe7wAD86qJUNwvQv9ar7c_7ODAL1SkeTQkc",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T03:11:48.140155",
  "last_access": null
}

Record 4:
{
  "id": "W3I9JPvzDfJuCHxfOiF_aoCWZHmwQunYBUiUNHTTksk",
  "name": "datapusher",
 

# Revoke API Tokens

In [7]:
def revoke_api_tokens(
    *,
    api_key: str,
    username: str,
    token_name: Optional[str] = None,
    dry_run: bool = False,
    limit: Optional[int] = 10
) -> dict:
    """
    Revoke (delete) CKAN API tokens for a user (by username).

    Tokens are listed with `list_api_tokens` (fields seen on this instance:
    id, name, user_id, created_at, last_access). Each selected token is revoked
    by sending its `id` to `api_token_revoke` as `jti`, the token-id field of
    that action.

    Args:
      api_key: required (Authorization header)
      username: required
      token_name: if provided, revoke only tokens with this exact name (no prompt).
        If omitted, ALL of the user's tokens are selected and an interactive
        confirmation is required (unless dry_run).
      dry_run: if True, don't revoke—just show what would be revoked
      limit: maximum number of tokens handled in this call (default 10);
        None means no cap. Applied to the selection, so the preview, the
        dry-run report and the actual revocation all cover the same tokens.

    Returns:
      dict summary with per-token status: "revoked" and "skipped" counts plus
      a "details" list (and a "message" when nothing was revoked).

    Raises:
      ValueError: blank api_key/username/token_name, or negative limit.
      LookupError: token_name was given but no token has that name.
    """
    # --- validate everything before touching the network ---
    if not api_key or not api_key.strip():
        raise ValueError("api_key is required")
    if not username or not username.strip():
        raise ValueError("username is required")
    if limit is not None and limit < 0:
        raise ValueError("limit must be >= 0 (or None for no limit)")

    name_filter: Optional[str] = None
    if token_name is not None:
        name_filter = token_name.strip()
        if not name_filter:
            raise ValueError("token_name cannot be empty/whitespace")

    user = username.strip()
    tokens = list_api_tokens(api_key=api_key, username=user)

    # --- select targets ---
    if name_filter is not None:
        targets = [t for t in tokens if t.get("name") == name_filter]
        if not targets:
            raise LookupError(f"No tokens found with name: {token_name!r}")
    else:
        targets = tokens

    # Cap the selection up front. Checking an index inside the loop processed
    # one token too many and was bypassed whenever a token was skipped.
    if limit is not None:
        targets = targets[:limit]

    if not targets:
        return {"revoked": 0, "skipped": 0, "details": [], "message": "No tokens to revoke."}

    print(f"User: {user}")
    print(f"Tokens found: {len(tokens)}")
    print(f"Tokens selected for revocation: {len(targets)}")
    print_key_vals(targets, num_keys=12, num_records=min(10, len(targets)))

    if dry_run:
        print("Dry run enabled. No tokens were revoked.")
        return {
            "revoked": 0,
            "skipped": len(targets),
            "details": [{"id": t.get("id"), "name": t.get("name")} for t in targets],
            "message": "Dry run only.",
        }

    # Revoking without a name filter is the dangerous path: ask explicitly.
    if token_name is None:
        confirm = input(
            "You are about to REVOKE ALL tokens listed above. Type 'REVOKE ALL' to continue: "
        ).strip()
        if confirm != "REVOKE ALL":
            print("Cancelled. No tokens were revoked.")
            return {"revoked": 0, "skipped": len(targets), "details": [], "message": "Cancelled by user."}

    details = []
    revoked = 0
    skipped = 0

    for t in targets:
        token_id = t.get("id")
        label = t.get("name")
        if not token_id:
            skipped += 1
            details.append({"id": None, "name": label, "status": "skipped", "reason": "missing id"})
            continue

        try:
            ckan_action(
                "api_token_revoke",
                method="POST",
                # Only the id is sent; the token *name* is not a valid value
                # for the action's "token" field.
                data={"jti": token_id},
                api_key=api_key,
            )
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so one failing
            # token is recorded and the remaining ones are still attempted.
            msg = str(e)
            # common permission-ish signals
            if "403" in msg or "not authorized" in msg.lower():
                details.append({
                    "id": token_id,
                    "name": label,
                    "status": "failed",
                    "error": "Not authorized (403). You may need to be the token owner or a sysadmin.",
                })
            else:
                details.append({"id": token_id, "name": label, "status": "failed", "error": msg})
            skipped += 1
        else:
            revoked += 1
            details.append({"id": token_id, "name": label, "status": "revoked"})

    print(f"Done. Revoked={revoked}, Failed/Skipped={skipped}")
    return {"revoked": revoked, "skipped": skipped, "details": details}


In [8]:
# WARNING: destructive. Revokes the `ckan_admin` tokens named 'datapusher';
# limit=None removes the per-call cap. Pass dry_run=True first to preview
# the selection without revoking anything.
revoke_api_tokens(
    api_key=CKAN_API_KEY,
    username='ckan_admin',
    token_name='datapusher',
    limit=None
)


User: ckan_admin
Tokens found: 1625
Tokens selected for revocation: 1620
Record 0:
{
  "id": "CgrDv9EuiupEdjFuCN96thMtcvENRu4u7pdwnX3GrTY",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T00:43:54.909777",
  "last_access": null
}

Record 1:
{
  "id": "RQnRrSFj-1FsRXlEAtIN70mW0e1u7E2WyfFaOp5baRU",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-19T04:32:23.529843",
  "last_access": "2024-03-19T10:39:59.324042"
}

Record 2:
{
  "id": "yirdop13NfLEz1PP1dfI_u9DsdXn3O1ZvsSDr5PbxWs",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T02:43:28.140492",
  "last_access": "2024-03-06T03:04:02.927656"
}

Record 3:
{
  "id": "H_4MQFYxHe7wAD86qJUNwvQv9ar7c_7ODAL1SkeTQkc",
  "name": "datapusher",
  "user_id": "8df5712a-8480-48e2-b1f4-397c9cc48a3f",
  "created_at": "2024-03-06T03:11:48.140155",
  "last_access": null
}

Record 4:
{
  "id":

{'revoked': 1620,
 'skipped': 0,
 'details': [{'id': 'CgrDv9EuiupEdjFuCN96thMtcvENRu4u7pdwnX3GrTY',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'RQnRrSFj-1FsRXlEAtIN70mW0e1u7E2WyfFaOp5baRU',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'yirdop13NfLEz1PP1dfI_u9DsdXn3O1ZvsSDr5PbxWs',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'H_4MQFYxHe7wAD86qJUNwvQv9ar7c_7ODAL1SkeTQkc',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'W3I9JPvzDfJuCHxfOiF_aoCWZHmwQunYBUiUNHTTksk',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'eRVQB-Nk67qxRvOSIEqGk9iAsw6gpaNyGpQfmCfpPC4',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'uqXyMZBFQgcHavu1w6xuXcfD0TZ3bM_SwbufKB1eIvo',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'Y7wxDvB5rBT1MZjiyy7nAe4chfFr6s7aIO4-cOtHu0s',
   'name': 'datapusher',
   'status': 'revoked'},
  {'id': 'DsoIYiJTtVbf33aoQz9yrR0YFBNz69Vfs0_Nfx1NY-U',
   'name': 'datapusher',
   'status': 'revoked'},
  

<center><h1>Advanced</h1></center>

# DataStore

Submit a resource (e.g., an attached CSV file) to DataPusher for processing.

In [25]:
def datapusher_submit(*, api_key: str, resource_id: str) -> dict:
    """
    Ask CKAN's DataPusher to process (or re-process) a single resource.

    Request sent:
      POST /api/3/action/datapusher_submit
      JSON body: {"resource_id": "<id>"}
      Header: Authorization: <api_key>

    Returns CKAN's response payload unchanged. Raises ValueError when either
    argument is empty or whitespace-only.

    Notes:
    - Typically requires a sysadmin/admin API key depending on CKAN/DataPusher config.
    """
    if not (api_key and api_key.strip()):
        raise ValueError("api_key is required (often sysadmin/admin for datapusher_submit)")

    # Normalise once; an empty result covers both missing and blank ids.
    rid = resource_id.strip() if resource_id else ""
    if not rid:
        raise ValueError("resource_id is required")

    return ckan_action(
        "datapusher_submit",
        method="POST",
        data={"resource_id": rid},
        api_key=api_key,
    )


# Example usage:
# result = datapusher_submit(api_key=CKAN_API_KEY, resource_id="xxxxxxxxxxxxxx")
# print_key_vals(result, num_keys=20, num_records=1)


In [29]:
# Submit one resource to DataPusher and print the raw API response payload.
result = datapusher_submit(api_key=CKAN_API_KEY, resource_id="0fa3fb1b-857b-44d5-91d7-46d510af627f")
print_key_vals(result, num_keys=20, num_records=1)

Record 0:
{
  "help": "https://instrument-test.data.auscope.org.au/api/3/action/help_show?name=datapusher_submit",
  "success": true,
  "result": false
}

