diff --git a/docs/docs/cogstack-ce/tutorial/end-to-end-jupyterhub.md b/docs/docs/cogstack-ce/tutorial/end-to-end-jupyterhub.md index 35b3402..f0b53f2 100644 --- a/docs/docs/cogstack-ce/tutorial/end-to-end-jupyterhub.md +++ b/docs/docs/cogstack-ce/tutorial/end-to-end-jupyterhub.md @@ -13,7 +13,10 @@ By the end, you will have completed an end-to-end user flow: !!! tip The following tutorial will use your CogStack CE installation and let you run real code against your environment. - To see a non-interactive version of the tutorial notebook, refer to [the MedCAT Service Tutorial notebook](../../platform/cogstack-ai/medcat-service-tutorial.ipynb). + To see a non-interactive version of the tutorial notebook, refer to: + + - [the MedCAT Service Tutorial notebook](../../platform/cogstack-ai/medcat-service-tutorial.ipynb). + - [the OpenSearch E2E notebook](./medcat-opensearch-e2e.ipynb). ## Before you start @@ -46,11 +49,12 @@ Log in with: After login, JupyterLab opens for your user. -## Step 3: Open the bundled notebook +## Step 3: Open the bundled notebooks -The chart includes an example notebook: +The chart includes example notebooks to interact with CogStack CE: - `medcat-service-tutorial.ipynb` +- `medcat-opensearch-e2e.ipynb` You can open it directly: @@ -62,10 +66,11 @@ Or navigate to it in JupyterLab and click to open it. Run each cell in order from top to bottom. -The notebook demonstrates service calls to: +The notebooks demonstrate service calls to: - `medcat-service` at `/api/process` for named entity extraction - `anoncat-service` at `/api/process` for de-identification +- `OpenSearch` for indexing and searching data. It uses environment variables for service URLs where available, so the default CogStack CE setup should work without edits. @@ -83,13 +88,12 @@ If those outputs appear, you have validated the full end-to-end flow from Jupyte - If JupyterHub does not load, ensure port-forwarding is running. 
- If notebook requests fail, verify the cluster services are up and re-run: - - `helm get notes | bash` -- For production deployments, replace dummy authentication with secure auth configuration. +- For production deployments, replace dummy authentication with secure auth configuration. ## Next Steps - See the [full deployment documentation](../../platform/deployment/_index.md) for more details on scaling, production security, and advanced configuration options. - See full install instructions of the cogstack CE chart[CogStack CE Helm chart (install + customization)](../../platform/deployment/helm/charts/cogstack-ce-helm.md) -- See further tutorials on medcat on [GitHub](https://github.com/CogStack/cogstack-nlp/tree/79f00cfc204f4ae559b56c8e397bbcaf82d44274/medcat-v2-tutorials) \ No newline at end of file +- See further tutorials on medcat on [GitHub](https://github.com/CogStack/cogstack-nlp/tree/79f00cfc204f4ae559b56c8e397bbcaf82d44274/medcat-v2-tutorials) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index d24429b..dfbdbfd 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -39,7 +39,8 @@ nav: - Overview: cogstack-ce/_index.md - Quickstart: cogstack-ce/tutorial/quickstart-installation.md - Tutorials: - - End To End Tutorial: cogstack-ce/tutorial/end-to-end-jupyterhub.md + - Using JupyterHub: cogstack-ce/tutorial/end-to-end-jupyterhub.md + - End To End Tutorial: cogstack-ce/tutorial/medcat-opensearch-e2e.ipynb - CogStack AI: - CogStack AI: cogstack-ai/index.md - Natural Language Processing: overview/Natural Language Processing.md diff --git a/docs/scripts/copy_files_from_repo.py b/docs/scripts/copy_files_from_repo.py index 014eb96..2a36454 100644 --- a/docs/scripts/copy_files_from_repo.py +++ b/docs/scripts/copy_files_from_repo.py @@ -24,7 +24,11 @@ { "sourceFilePath": "helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb", "outputFilePath": "platform/cogstack-ai/medcat-service-tutorial.ipynb", - } + }, + { + "sourceFilePath": 
"helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-opensearch-e2e.ipynb", + "outputFilePath": "cogstack-ce/tutorial/medcat-opensearch-e2e.ipynb", + }, ] @@ -46,4 +50,3 @@ def main() -> None: main() - diff --git a/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-opensearch-e2e.ipynb b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-opensearch-e2e.ipynb new file mode 100644 index 0000000..845c943 --- /dev/null +++ b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-opensearch-e2e.ipynb @@ -0,0 +1,707 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a7844791", + "metadata": {}, + "source": [ + "# MedCAT + OpenSearch End-to-End Demo\n", + "\n", + "This is a short, practical walkthrough showing how to turn one clinical note into searchable concepts.\n", + "\n", + "It is an end to end example of how you can use CogStack to unlock the power of your healthcare data.\n", + "\n", + "## Overview\n", + "\n", + "### Who this is for\n", + "This is for developers, data engineers, and analysts who want to see a practical example of how CogStack, MedCAT and Opensearch can be integrated to let you perform advanced search on your notes. \n", + "\n", + "### What this notebook does\n", + "1. Index one sample note into `discharge`\n", + "2. Search that note back using free text\n", + "3. Search that note back even when we have typos\n", + "4. Perform Named Entity Resolution (NER) by calling MedCAT Service and index them\n", + "5. Search notes by concept\n", + "\n", + "The goal is to show that this process is straightforward: call one API, index results, and query them.\n", + "\n", + "### Prerequisites\n", + "The best way to run this notebook interactively is to run the CogStack Community Edition with Helm. Look at https://docs.cogstack.org/ to get started." 
+ ] + }, + { + "cell_type": "markdown", + "id": "875a64db", + "metadata": {}, + "source": [ + "## Initialisation: Define the inputs and services\n", + "\n", + "### Input Data\n", + "We define a short input for this tutorial. This represents your free text patient data, for example a discharge summary or long doctor's note.\n", + "\n", + "The sample sentence contains concepts that the example demo packs used by medcat service have been trained for. \n", + "\n", + "### Service definitions\n", + "We will set up a client for OpenSearch, and define the HTTP endpoint for medcat service.\n", + "\n", + "If using the cogstack community edition helm chart, these should all be set up for you automatically using kubernetes services and env vars. Otherwise change these accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "17deaa5c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/py311/lib/python3.11/site-packages/opensearchpy/connection/http_urllib3.py:214: UserWarning: Connecting to https://opensearch-cluster-master:9200 using SSL with verify_certs=False is insecure.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import os\n", + "from datetime import datetime, timezone\n", + "from urllib.parse import urlparse\n", + "\n", + "import pandas as pd\n", + "import requests\n", + "import urllib3\n", + "from IPython.display import display\n", + "from opensearchpy import OpenSearch\n", + "\n", + "# The sample note that we will work with\n", + "sample_text = \"John was diagnosed with Kidney Failure\"\n", + "\n", + "# Service URLs from environment variables\n", + "medcat_base_url = os.getenv(\"MEDCAT_URL\", \"http://cogstack-medcat-service:5000\").rstrip(\"/\")\n", + "medcat_url = medcat_base_url + \"/api/process\"\n", + "\n", + "opensearch_url = os.getenv(\"OPENSEARCH_URL\", \"https://opensearch-cluster-master:9200\")\n", + "opensearch_username = os.getenv(\"OPENSEARCH_USERNAME\", 
\"admin\")\n", + "opensearch_password = os.getenv(\"OPENSEARCH_PASSWORD\", \"opensearch-312$A\")\n", + "\n", + "parsed = urlparse(opensearch_url)\n", + "\n", + "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", + "\n", + "host_cfg = {\n", + " \"host\": parsed.hostname,\n", + " \"port\": parsed.port or (443 if parsed.scheme == \"https\" else 80),\n", + "}\n", + "if parsed.path and parsed.path != \"/\":\n", + " host_cfg[\"url_prefix\"] = parsed.path.lstrip(\"/\")\n", + "\n", + "client = OpenSearch(\n", + " hosts=[host_cfg],\n", + " http_auth=(opensearch_username, opensearch_password),\n", + " use_ssl=(parsed.scheme == \"https\"),\n", + " verify_certs=False,\n", + ")\n", + "\n", + "# Hardcoded demo indices\n", + "discharge_index = \"discharge\"\n", + "annotations_index = \"discharge_annotations\"\n", + "\n", + "# Static demo note id and subject id used across all steps\n", + "note_id, subject_id = \"demo-note-kidney-failure-001\", 1" + ] + }, + { + "cell_type": "markdown", + "id": "19340174", + "metadata": {}, + "source": [ + "## 1) Index the note into OpenSearch\n", + "\n", + "We write the note into `discharge`, then immediately run a free-text query (`kidney failure`) to prove it is searchable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "3eb85731", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_index': 'discharge',\n", + " '_id': 'demo-note-kidney-failure-001',\n", + " '_version': 10,\n", + " 'result': 'updated',\n", + " 'forced_refresh': True,\n", + " '_shards': {'total': 2, 'successful': 1, 'failed': 0},\n", + " '_seq_no': 1009,\n", + " '_primary_term': 1}" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "note_doc = {\n", + " \"note_id\": note_id,\n", + " \"subject_id\": subject_id,\n", + " \"text\": sample_text,\n", + " \"storetime\": datetime.now(timezone.utc).strftime(\"%Y-%m-%d %H:%M:%S\"),\n", + "}\n", + "\n", + "client.index(index=discharge_index, id=note_id, body=note_doc, refresh=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "edad8661", + "metadata": {}, + "source": [ + "## 2) Search that note back using free text\n", + "\n", + "This query uses `match` search, so we can find notes by important words (for example `John kidney`) without requiring an exact full-string match.\n", + "\n", + "In a traditional relational query, you would usually rely on exact equality or simple wildcard `LIKE` patterns. Here, OpenSearch handles tokenized full-text search for us." + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "25613475", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Free-text query used: John kidney\n", + "This still returns the note even though it is not an exact full sentence match.\n", + "Results from OpenSearch free-text search:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
note_idsubject_idtextstoretime
0demo-note-kidney-failure-0011John was diagnosed with Kidney Failure2026-03-26 17:49:59
\n", + "
" + ], + "text/plain": [ + " note_id subject_id \\\n", + "0 demo-note-kidney-failure-001 1 \n", + "\n", + " text storetime \n", + "0 John was diagnosed with Kidney Failure 2026-03-26 17:49:59 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query_text = \"John kidney\"\n", + "free_text_resp = client.search(\n", + " index=discharge_index,\n", + " body={\"query\": {\"match\": {\"text\": query_text}}},\n", + ")\n", + "hits = free_text_resp[\"hits\"][\"hits\"]\n", + "print(f\"Free-text query used: {query_text}\")\n", + "print(\"This still returns the note even though it is not an exact full sentence match.\")\n", + "print(\"Results from OpenSearch free-text search:\")\n", + "display(pd.DataFrame([hits[0][\"_source\"]]))" + ] + }, + { + "cell_type": "markdown", + "id": "c8aaa960", + "metadata": {}, + "source": [ + "## 3) Fuzzy full-text search (not exact matching)\n", + "\n", + "Now we intentionally misspell the query (`kidny falur`) and still retrieve results.\n", + "\n", + "This demonstrates why OpenSearch is useful for user-entered text and typo-tolerant retrieval." + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "595a313e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fuzzy query: kidny falur\n", + "fuzzy_hits=1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
note_idsubject_idtextstoretime
0demo-note-kidney-failure-0011John was diagnosed with Kidney Failure2026-03-26 17:49:59
\n", + "
" + ], + "text/plain": [ + " note_id subject_id \\\n", + "0 demo-note-kidney-failure-001 1 \n", + "\n", + " text storetime \n", + "0 John was diagnosed with Kidney Failure 2026-03-26 17:49:59 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fuzzy_query = \"kidny falur\"\n", + "fuzzy_resp = client.search(\n", + " index=discharge_index,\n", + " body={\n", + " \"query\": {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": fuzzy_query,\n", + " \"fuzziness\": \"AUTO\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + ")\n", + "\n", + "fuzzy_hits = fuzzy_resp[\"hits\"][\"hits\"]\n", + "print(f\"Fuzzy query: {fuzzy_query}\")\n", + "print(f\"fuzzy_hits={len(fuzzy_hits)}\")\n", + "display(pd.DataFrame(pd.DataFrame([fuzzy_hits[0][\"_source\"]])))" + ] + }, + { + "cell_type": "markdown", + "id": "690cd9a2", + "metadata": {}, + "source": [ + "## 4) Perform Named Entity Resolution with MedCAT\n", + "\n", + "We can see that we are able to search with free text, and fuzzy match. However, what happens if we want to search across notes using common terminology?\n", + "\n", + "We can solve this by using named entity resolution (NER) and NLP.\n", + "\n", + "To do this we will call MedCAT at `/api/process` with the same note text.\n", + "\n", + "MedCAT returns structured entities (for example CUI and concept name). This is named entity resolution in one API call." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "f1428daf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "annotations_found=1\n", + "Results from MedCAT named entity extraction:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pretty_namecuitype_idssource_valuedetected_nameacccontext_similaritystartendidmeta_annscontext_leftcontext_centercontext_right
0Kidney Failure1[T047]Kidney Failurekidney~failure1124380{}[][][]
\n", + "
" + ], + "text/plain": [ + " pretty_name cui type_ids source_value detected_name acc \\\n", + "0 Kidney Failure 1 [T047] Kidney Failure kidney~failure 1 \n", + "\n", + " context_similarity start end id meta_anns context_left context_center \\\n", + "0 1 24 38 0 {} [] [] \n", + "\n", + " context_right \n", + "0 [] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "medcat_payload = {\"content\": {\"text\": sample_text}}\n", + "medcat_result = requests.post(medcat_url, json=medcat_payload, timeout=30).json()\n", + "raw_annotations = medcat_result.get(\"result\", {}).get(\"annotations\", [])\n", + "\n", + "annotations = [\n", + " next(iter(ann.values())) if isinstance(ann, dict) and len(ann) == 1 else ann\n", + " for ann in raw_annotations\n", + " if isinstance(ann, dict)\n", + "]\n", + "\n", + "print(f\"annotations_found={len(annotations)}\")\n", + "print(\"Results from MedCAT named entity extraction:\")\n", + "display(pd.DataFrame(annotations))" + ] + }, + { + "cell_type": "markdown", + "id": "1dc6dbf6", + "metadata": {}, + "source": [ + "### 4.1) Index MedCAT entities into OpenSearch\n", + "\n", + "Here we take each MedCAT entity and store it in OpenSearch in the `discharge_annotations` index\n", + "\n", + "We prefix MedCAT fields with `nlp.` and add `meta.note_id` / `meta.subject_id` so each entity stays linked to its source note." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "86152c7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "indexed_annotations=1\n" + ] + } + ], + "source": [ + "indexed = 0\n", + "now_ts = datetime.now(timezone.utc).isoformat()\n", + "\n", + "for i, ann in enumerate(annotations):\n", + " nlp_fields = {f\"nlp.{k}\": v for k, v in ann.items()}\n", + "\n", + " ann_doc = {\n", + " **nlp_fields,\n", + " \"meta.note_id\": note_id,\n", + " \"meta.subject_id\": subject_id,\n", + " \"timestamp\": now_ts,\n", + " }\n", + "\n", + " client.index(\n", + " index=annotations_index,\n", + " id=f\"{note_id}-ann-{i}\",\n", + " body=ann_doc,\n", + " refresh=False,\n", + " )\n", + " indexed += 1\n", + "\n", + "client.indices.refresh(index=annotations_index)\n", + "print(f\"indexed_annotations={indexed}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f0067cc1", + "metadata": {}, + "source": [ + "## 5) Search by concept\n", + "\n", + "Finally, we query `discharge_annotations` using the extracted concept (`nlp.cui` / `nlp.pretty_name`).\n", + "\n", + "This is the main value: instead of searching raw strings, we can retrieve notes by normalized clinical concepts." + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "3a6edeb8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concept CUI search used: 1\n", + "concept_hits=1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nlp.pretty_namenlp.cuinlp.type_idsnlp.source_valuenlp.detected_namenlp.accnlp.context_similaritynlp.startnlp.endnlp.idnlp.meta_annsnlp.context_leftnlp.context_centernlp.context_rightmeta.note_idmeta.subject_idtimestamp
0Kidney Failure1[T047]Kidney Failurekidney~failure1124380{}[][][]demo-note-kidney-failure-00112026-03-26T17:50:17.140165+00:00
\n", + "
" + ], + "text/plain": [ + " nlp.pretty_name nlp.cui nlp.type_ids nlp.source_value nlp.detected_name \\\n", + "0 Kidney Failure 1 [T047] Kidney Failure kidney~failure \n", + "\n", + " nlp.acc nlp.context_similarity nlp.start nlp.end nlp.id nlp.meta_anns \\\n", + "0 1 1 24 38 0 {} \n", + "\n", + " nlp.context_left nlp.context_center nlp.context_right \\\n", + "0 [] [] [] \n", + "\n", + " meta.note_id meta.subject_id \\\n", + "0 demo-note-kidney-failure-001 1 \n", + "\n", + " timestamp \n", + "0 2026-03-26T17:50:17.140165+00:00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "concept_cui = str(annotations[0].get(\"cui\", \"\"))\n", + "\n", + "concept_query = {\n", + " \"query\": {\n", + " \"term\": {\n", + " \"nlp.cui.keyword\": concept_cui\n", + " }\n", + " }\n", + "}\n", + "\n", + "concept_resp = client.search(index=annotations_index, body=concept_query)\n", + "concept_hits = concept_resp[\"hits\"][\"hits\"]\n", + "\n", + "print(f\"Concept CUI search used: {concept_cui}\")\n", + "print(f\"concept_hits={len(concept_hits)}\")\n", + "\n", + "display(pd.DataFrame([h.get(\"_source\", {}) for h in concept_hits]))" + ] + }, + { + "cell_type": "markdown", + "id": "28cf3ed2", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "You have now seen the full end-to-end CogStack flow in a few simple steps:\n", + "- index notes into OpenSearch\n", + "- run free-text and fuzzy search over clinical text\n", + "- call MedCAT to perform named entity resolution\n", + "- index entity outputs\n", + "- retrieve notes by normalized concept (CUI)\n", + "\n", + "This is the core building block for turning unstructured clinical text into searchable, analysable, and operational data.\n", + "\n", + "## What to do next\n", + "- **Visualise the data with OpenSearch Dashboards** \n", + " If you've setup with the CogStack Community Edition and are running on localhost, visit http://localhost:5601/ to now see reports and drill down on this data with the 
UI\n", + "\n", + "- **Scale this into production ETL** \n", + " Use these exact blocks in your pipelines: ingest note text -> index to OpenSearch -> call MedCAT -> index annotations -> query/serve downstream applications.\n", + "\n", + "- **Use a real MedCAT model** \n", + " Replace the demo model with a domain-appropriate model pack and configuration: [MedCAT v2 README](https://github.com/CogStack/cogstack-nlp/blob/main/medcat-v2/README.md).\n", + "\n", + "- **Explore the platform docs and examples** \n", + " See full docs at [docs.cogstack.org](https://docs.cogstack.org/) and repositories/examples at [github.com/CogStack](https://github.com/CogStack).\n", + "\n", + "- **Add supervised learning with MedCAT Trainer (MLOps)** \n", + " Set up a training and feedback loop to improve extraction quality over time using MedCAT Trainer (annotation -> train -> evaluate -> redeploy)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb index 09124d7..da883e9 100644 --- a/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb +++ b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb @@ -10,16 +10,21 @@ "This is a step-by-step walkthrough that shows how to call two CogStack services over HTTP:\n", "MedCAT (entity extraction) and AnonCAT (de-identification).\n", "\n", - "## Who it is for:\n", + "## Overview\n", + "\n", + "### Who it is for:\n", 
"This is for developers, data engineers, and analysts who want a quick, practical example of how\n", "to integrate MedCAT/AnonCAT into a Python workflow (and later into a notebook-based analysis).\n", "\n", - "## What it will do:\n", + "### What it will do:\n", "1) Define a sample clinical sentence and the service URLs.\n", "2) Extract Entities, by calling the medcat-service API\n", "3) Print the extracted entity annotations from the MedCAT response.\n", "4) Deidentify text by calling the anoncat-service API\n", - "5) Print the de-identified text (and show the full JSON response for inspection).\n" + "5) Print the de-identified text (and show the full JSON response for inspection).\n", + "\n", + "### Prerequisites\n", + "The best way to run this notebook interactively is to run the CogStack Community Edition with Helm. Look at https://docs.cogstack.org/ to get started." ] }, { @@ -37,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "a5f15845-5a0f-414e-9db7-f414d12bde48", "metadata": {}, "outputs": [], @@ -49,11 +54,11 @@ "sample_text = \"John was diagnosed with Kidney Failure\"\n", "\n", "medcat_base_url = os.getenv(\n", - " \"MEDCAT_URL\", \"http://cogstack-helm-ce-medcat-service:5000\"\n", + " \"MEDCAT_URL\", \"http://cogstack-medcat-service:5000\"\n", ").rstrip(\"/\")\n", "\n", "anoncat_base_url = os.getenv(\n", - " \"ANONCAT_URL\", \"http://cogstack-helm-ce-anoncat-service:5000\"\n", + " \"ANONCAT_URL\", \"http://cogstack-ce-anoncat-service:5000\"\n", ").rstrip(\"/\")\n", "\n", "medcat_url = medcat_base_url + \"/api/process\"\n", diff --git a/helm-charts/cogstack-helm-ce/provisioning/base_index_settings.json b/helm-charts/cogstack-helm-ce/provisioning/base_index_settings.json index 949426e..99d2b78 100644 --- a/helm-charts/cogstack-helm-ce/provisioning/base_index_settings.json +++ b/helm-charts/cogstack-helm-ce/provisioning/base_index_settings.json @@ -5,7 +5,9 @@ "emar", "icustays", "patients", - "poe" + "poe", + 
"discharge", + "discharge_annotations" ], "template": { "mappings": { diff --git a/helm-charts/cogstack-helm-ce/provisioning/dashboards.ndjson b/helm-charts/cogstack-helm-ce/provisioning/dashboards.ndjson index 4e241db..fedf08b 100644 --- a/helm-charts/cogstack-helm-ce/provisioning/dashboards.ndjson +++ b/helm-charts/cogstack-helm-ce/provisioning/dashboards.ndjson @@ -18,4 +18,5 @@ {"attributes":{"fields":"[{\"count\":0,\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_score\",\"type\":\"number\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_type\",\"type\":\"string\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"anchor_age\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"anchor_year\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"anchor_year_group\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"anchor_year_group.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"anchor_year_group\"}}},{\"count\":0,\"name\":\"comments\"
,\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":1,\"name\":\"dod\",\"type\":\"date\",\"esTypes\":[\"date\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"dose_val_rx\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"gen_dom\",\"type\":\"geo_point\",\"esTypes\":[\"geo_point\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"gen_loc\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"gen_loc.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"gen_loc\"}}},{\"count\":0,\"name\":\"gender\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"gender.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"gender\"}}},{\"count\":0,\"name\":\"subject_id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"patients"},"id":"e6ff0d40-11f1-11ef-ad94-374d81247b5b","migrationVersion":{"index-pattern":"7.6.0"},"references":[],"type":"index-pattern","updated_at":"2024-05-24T17:32:47.161Z","version":"WzUwLDFd"} 
{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Patient_Age","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Patient_Age\",\"type\":\"histogram\",\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"params\":{},\"schema\":\"metric\"},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"params\":{\"field\":\"anchor_age\",\"interval\":\"auto\",\"min_doc_count\":false,\"has_extended_bounds\":false,\"extended_bounds\":{\"max\":\"\",\"min\":\"\"},\"customLabel\":\"Age\"},\"schema\":\"segment\"}],\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"labels\":{\"filter\":true,\"show\":true,\"truncate\":100},\"position\":\"bottom\",\"scale\":{\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{},\"type\":\"category\"}],\"grid\":{\"categoryLines\":false},\"labels\":{\"show\":false},\"legendPosition\":\"right\",\"seriesParams\":[{\"data\":{\"id\":\"1\",\"label\":\"Count\"},\"drawLinesBetweenPoints\":true,\"lineWidth\":2,\"mode\":\"stacked\",\"show\":true,\"showCircles\":true,\"type\":\"histogram\",\"valueAxis\":\"ValueAxis-1\"}],\"thresholdLine\":{\"color\":\"#E7664C\",\"show\":false,\"style\":\"full\",\"value\":10,\"width\":1},\"times\":[],\"type\":\"histogram\",\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"labels\":{\"filter\":false,\"rotate\":0,\"show\":true,\"truncate\":100},\"name\":\"LeftAxis-1\",\"position\":\"left\",\"scale\":{\"mode\":\"normal\",\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{\"text\":\"Count\"},\"type\":\"value\"}]}}"},"id":"9babc1b0-185b-11ef-8167-970bdc38cc41","migrationVersion":{"visualization":"7.10.0"},"references":[{"id":"e6ff0d40-11f1-11ef-ad94-374d81247b5b","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2
024-05-22T16:52:00.458Z","version":"WzQzLDFd"} {"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"language\":\"kuery\",\"query\":\"\"},\"filter\":[]}"},"optionsJSON":"{\"hidePanelTitles\":false,\"useMargins\":true}","panelsJSON":"[{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15,\"i\":\"f3b28bcf-f3a0-4f97-93fe-2173dd6ae800\"},\"panelIndex\":\"f3b28bcf-f3a0-4f97-93fe-2173dd6ae800\",\"embeddableConfig\":{},\"panelRefName\":\"panel_0\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":15,\"i\":\"b745abf1-caed-4c86-9691-52debc610257\"},\"panelIndex\":\"b745abf1-caed-4c86-9691-52debc610257\",\"embeddableConfig\":{},\"panelRefName\":\"panel_1\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":15,\"w\":12,\"h\":12,\"i\":\"ddb64d89-b811-41ce-92c1-25bab88c6d6b\"},\"panelIndex\":\"ddb64d89-b811-41ce-92c1-25bab88c6d6b\",\"embeddableConfig\":{},\"panelRefName\":\"panel_2\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":12,\"y\":15,\"w\":12,\"h\":12,\"i\":\"a970b0d9-dff5-4b87-bef4-1d7788e6a4b1\"},\"panelIndex\":\"a970b0d9-dff5-4b87-bef4-1d7788e6a4b1\",\"embeddableConfig\":{},\"panelRefName\":\"panel_3\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":24,\"y\":15,\"w\":12,\"h\":12,\"i\":\"b857c534-28da-4a43-9c41-e36353aa86b3\"},\"panelIndex\":\"b857c534-28da-4a43-9c41-e36353aa86b3\",\"embeddableConfig\":{},\"panelRefName\":\"panel_4\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":36,\"y\":15,\"w\":12,\"h\":12,\"i\":\"9b043ef6-98a1-4e08-9ed4-7989e7776939\"},\"panelIndex\":\"9b043ef6-98a1-4e08-9ed4-7989e7776939\",\"embeddableConfig\":{},\"panelRefName\":\"panel_5\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":27,\"w\":26,\"h\":14,\"i\":\"c5b3a525-a1b8-4878-8bb1-61a852adb016\"},\"panelIndex\":\"c5b3a525-a1b8-4878-8bb1-61a852adb016\",\"embeddableConfig\":{},\"panelRefName\":\"panel_6\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":26,\"y\":27,\"w\":22,\"h\":14,\"i\":\"323fbcc4-6724-4767-
8b5a-af0fe2111d16\"},\"panelIndex\":\"323fbcc4-6724-4767-8b5a-af0fe2111d16\",\"embeddableConfig\":{},\"panelRefName\":\"panel_7\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":41,\"w\":24,\"h\":15,\"i\":\"b511c52f-1fd0-4391-bd3a-c34abc0c8f6e\"},\"panelIndex\":\"b511c52f-1fd0-4391-bd3a-c34abc0c8f6e\",\"embeddableConfig\":{},\"panelRefName\":\"panel_8\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":24,\"y\":41,\"w\":24,\"h\":15,\"i\":\"e88e9d91-452e-49a8-8d43-4b37b22c9444\"},\"panelIndex\":\"e88e9d91-452e-49a8-8d43-4b37b22c9444\",\"embeddableConfig\":{},\"panelRefName\":\"panel_9\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":56,\"w\":24,\"h\":15,\"i\":\"55842b9a-3ae3-4313-9298-35693e14ef2a\"},\"panelIndex\":\"55842b9a-3ae3-4313-9298-35693e14ef2a\",\"embeddableConfig\":{\"vis\":null},\"panelRefName\":\"panel_10\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":24,\"y\":56,\"w\":24,\"h\":15,\"i\":\"71eeee2e-f551-4d2e-bdd6-6bf9fa578dd8\"},\"panelIndex\":\"71eeee2e-f551-4d2e-bdd6-6bf9fa578dd8\",\"embeddableConfig\":{},\"panelRefName\":\"panel_11\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":0,\"y\":71,\"w\":24,\"h\":15,\"i\":\"383dad7a-3c80-4afc-9835-443e576f7dc4\"},\"panelIndex\":\"383dad7a-3c80-4afc-9835-443e576f7dc4\",\"embeddableConfig\":{},\"panelRefName\":\"panel_12\"},{\"version\":\"2.19.0\",\"gridData\":{\"x\":24,\"y\":71,\"w\":24,\"h\":15,\"i\":\"eac18176-42d0-463d-88a5-d52cbba3f21c\"},\"panelIndex\":\"eac18176-42d0-463d-88a5-d52cbba3f21c\",\"embeddableConfig\":{},\"panelRefName\":\"panel_13\"}]","timeRestore":false,"title":"Demo 
Dashboard","version":1},"id":"69908e20-e7a7-11ee-99af-876c5d67a178","migrationVersion":{"dashboard":"7.9.3"},"references":[{"id":"31302c60-16bf-11ef-9606-3fa2532fcc62","name":"panel_0","type":"visualization"},{"id":"d5b67c20-16b6-11ef-9606-3fa2532fcc62","name":"panel_1","type":"visualization"},{"id":"77448ac0-16bf-11ef-9606-3fa2532fcc62","name":"panel_2","type":"visualization"},{"id":"263f9ff0-1793-11ef-8167-970bdc38cc41","name":"panel_3","type":"visualization"},{"id":"77448ac0-16bf-11ef-9606-3fa2532fcc62","name":"panel_4","type":"visualization"},{"id":"c5ba7510-1792-11ef-8167-970bdc38cc41","name":"panel_5","type":"visualization"},{"id":"b3253970-1856-11ef-8167-970bdc38cc41","name":"panel_6","type":"visualization"},{"id":"ac17f0d0-185d-11ef-8167-970bdc38cc41","name":"panel_7","type":"visualization"},{"id":"0616a360-1390-11ef-9606-3fa2532fcc62","name":"panel_8","type":"map"},{"id":"821e5fc0-0d78-11ef-a513-a5083c720401","name":"panel_9","type":"map"},{"id":"584cd8d0-e7a7-11ee-99af-876c5d67a178","name":"panel_10","type":"visualization"},{"id":"92855e10-1857-11ef-8167-970bdc38cc41","name":"panel_11","type":"visualization"},{"id":"ced096e0-1858-11ef-8167-970bdc38cc41","name":"panel_12","type":"visualization"},{"id":"9babc1b0-185b-11ef-8167-970bdc38cc41","name":"panel_13","type":"visualization"}],"type":"dashboard","updated_at":"2025-07-08T13:47:18.459Z","version":"WzEwMywyMl0="} -{"exportedCount":20,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file 
+{"attributes":{"fields":"[{\"count\":1,\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_score\",\"type\":\"number\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_type\",\"type\":\"string\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"charttime\",\"type\":\"date\",\"esTypes\":[\"date\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"comments\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"dose_val_rx\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"hadm_id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":1,\"name\":\"note_id\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"note_id.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"note_id\"}}},{\"count\":0,\"name\":\"note_seq\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":tr
ue,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"note_type\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"note_type.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"note_type\"}}},{\"count\":1,\"name\":\"storetime\",\"type\":\"date\",\"esTypes\":[\"date\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":2,\"name\":\"subject_id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":2,\"name\":\"text\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"text.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"text\"}}}]","title":"discharge"},"id":"e4eb4e60-063e-11ef-a513-a5083c720401","migrationVersion":{"index-pattern":"7.6.0"},"references":[],"type":"index-pattern","updated_at":"2025-07-04T13:16:09.804Z","version":"Wzk5LDIyXQ=="} 
+{"attributes":{"fields":"[{\"count\":0,\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_score\",\"type\":\"number\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"_type\",\"type\":\"string\",\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"comments\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"dose_val_rx\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"enrich_top_level_concept\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"meta.note_id\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"meta.note_id.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"meta.note_id\"}}},{\"count\":0,\"name\":\"meta.subject_id\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"meta.subject_id.keyword\",\"type\":\"string\",\"
esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"meta.subject_id\"}}},{\"count\":0,\"name\":\"nlp.acc\",\"type\":\"number\",\"esTypes\":[\"float\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.context_similarity\",\"type\":\"number\",\"esTypes\":[\"float\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.cui\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.cui.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.cui\"}}},{\"count\":0,\"name\":\"nlp.detected_name\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.detected_name.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.detected_name\"}}},{\"count\":0,\"name\":\"nlp.end\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.icd10\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.icd10.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.icd10\"}}},{\"count\":0,\"name\":\"nlp.id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"
searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":1,\"name\":\"nlp.meta_anns.Presence.confidence\",\"type\":\"number\",\"esTypes\":[\"float\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":1,\"name\":\"nlp.meta_anns.Presence.name\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.meta_anns.Presence.name.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Presence.name\"}}},{\"count\":1,\"name\":\"nlp.meta_anns.Presence.value\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.meta_anns.Presence.value.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Presence.value\"}}},{\"count\":0,\"name\":\"nlp.meta_anns.Subject.confidence\",\"type\":\"number\",\"esTypes\":[\"float\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.meta_anns.Subject.name\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.meta_anns.Subject.name.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Subject.name\"}}},{\"count\":0,\"name\":\"nlp.meta_anns.Subject.value\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":fals
e},{\"count\":0,\"name\":\"nlp.meta_anns.Subject.value.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Subject.value\"}}},{\"count\":0,\"name\":\"nlp.meta_anns.Time.confidence\",\"type\":\"number\",\"esTypes\":[\"float\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.meta_anns.Time.name\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.meta_anns.Time.name.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Time.name\"}}},{\"count\":0,\"name\":\"nlp.meta_anns.Time.value\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.meta_anns.Time.value.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.meta_anns.Time.value\"}}},{\"count\":0,\"name\":\"nlp.ontologies\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.ontologies.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.ontologies\"}}},{\"count\":0,\"name\":\"nlp.pretty_name\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.pretty_name.keyword\",\"type\":\"str
ing\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.pretty_name\"}}},{\"count\":0,\"name\":\"nlp.source_value\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.source_value.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.source_value\"}}},{\"count\":0,\"name\":\"nlp.start\",\"type\":\"number\",\"esTypes\":[\"long\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"count\":0,\"name\":\"nlp.type_ids\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.type_ids.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.type_ids\"}}},{\"count\":0,\"name\":\"nlp.types\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"nlp.types.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"nlp.types\"}}},{\"count\":0,\"name\":\"service_model\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"service_model.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"service_model\"}}},{\"count\":0
,\"name\":\"service_version\",\"type\":\"string\",\"esTypes\":[\"text\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"count\":0,\"name\":\"service_version.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"service_version\"}}},{\"count\":0,\"name\":\"timestamp\",\"type\":\"date\",\"esTypes\":[\"date\"],\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"discharge_annotations"},"id":"5e5f7820-003f-11f0-922f-1b0cde0c046d","migrationVersion":{"index-pattern":"7.6.0"},"references":[],"type":"index-pattern","updated_at":"2025-11-21T14:33:42.793Z","version":"WzEwOCwzMl0="} \ No newline at end of file diff --git a/helm-charts/cogstack-helm-ce/provisioning/generate_synthetic_bulk_ndjson.py b/helm-charts/cogstack-helm-ce/provisioning/generate_synthetic_bulk_ndjson.py index 46c924b..389a38e 100644 --- a/helm-charts/cogstack-helm-ce/provisioning/generate_synthetic_bulk_ndjson.py +++ b/helm-charts/cogstack-helm-ce/provisioning/generate_synthetic_bulk_ndjson.py @@ -39,6 +39,8 @@ "icustays", "patients", "poe", + "discharge", + "discharge_annotations", ) @@ -301,6 +303,205 @@ def emar_id(self) -> str: return f"{self.subject_id}-{self.emar_seq}" +@dataclass(frozen=True) +class DischargeNote: + note_id: str + subject_id: int + hadm_id: int + note_type: str + note_seq: int + charttime: datetime + storetime: datetime + text: str + + +def make_discharge_notes(rng: random.Random, admissions: Sequence[Admission]) -> List[DischargeNote]: + notes: List[DischargeNote] = [] + for adm in admissions: + note_type = "DS" + note_seq = bounded_int(rng, 1, 20) + note_id = f"{adm.subject_id}-{note_type}-{note_seq}" + + charttime = rand_datetime(rng, adm.admittime, adm.dischtime, resolution_seconds=60) + storetime = min(adm.dischtime, charttime + 
timedelta(hours=bounded_int(rng, 0, 24), minutes=bounded_int(rng, 0, 59))) + + # Synthetic discharge-note-like template (non-derivative, intentionally generic). + problems = ( + "Chest pain", + "Shortness of breath", + "Abdominal pain", + "Dizziness", + "Headache", + "Fever", + "Fatigue", + ) + services = ("MEDICINE", "SURGERY", "CARDIOLOGY", "NEUROLOGY", "RESPIRATORY", "RENAL") + meds = ("acetaminophen", "heparin", "pantoprazole", "insulin", "furosemide", "saline flush") + diag = choose(rng, problems) + service = choose(rng, services) + discharge_to = adm.discharge_location + med_list = ", ".join(rng.sample(meds, k=bounded_int(rng, 2, 4))) + + text = ( + "\n" + "Discharge Summary\n" + "=================\n" + f"Subject: {adm.subject_id}\n" + f"Admission ID: {adm.hadm_id}\n" + f"Service: {service}\n" + "\n" + "Chief Complaint:\n" + f"{diag}\n" + "\n" + "Hospital Course:\n" + "The patient was evaluated and treated during this admission. Symptoms improved with supportive care.\n" + "Vital signs remained stable. 
No acute complications were documented.\n" + "\n" + "Discharge Diagnoses:\n" + f"- {diag}\n" + "\n" + "Discharge Medications:\n" + f"- {med_list}\n" + "\n" + "Follow-up:\n" + "- Primary care follow-up in 1-2 weeks.\n" + "- Return to care if symptoms worsen.\n" + "\n" + "Disposition:\n" + f"{discharge_to}\n" + "\n" + ) + + notes.append( + DischargeNote( + note_id=note_id, + subject_id=adm.subject_id, + hadm_id=adm.hadm_id, + note_type=note_type, + note_seq=note_seq, + charttime=charttime, + storetime=storetime, + text=text, + ) + ) + return notes + + +def discharge_doc(n: DischargeNote) -> Dict[str, object]: + return { + "note_id": n.note_id, + "subject_id": n.subject_id, + "hadm_id": n.hadm_id, + "note_type": n.note_type, + "note_seq": n.note_seq, + "charttime": fmt_dt(n.charttime), + "storetime": fmt_dt(n.storetime), + "text": n.text, + } + + +def _rand_timestamp_utc_iso(rng: random.Random, base: datetime) -> str: + # Example: 2024-04-13T16:17:02.103+00:00 + dt = base + timedelta(seconds=bounded_int(rng, -3600 * 24 * 30, 3600 * 24 * 30)) + ms = bounded_int(rng, 0, 999) + return dt.strftime("%Y-%m-%dT%H:%M:%S") + f".{ms:03d}+00:00" + + +def make_discharge_annotations(rng: random.Random, notes: Sequence[DischargeNote]) -> List[Dict[str, object]]: + ann: List[Dict[str, object]] = [] + pretty_names = ( + "Usage", + "Dose", + "Route", + "Frequency", + "Condition", + "Procedure", + "Symptom", + ) + detected_terms = ( + "use", + "dose", + "oral", + "daily", + "pain", + "fever", + "follow-up", + "medication", + ) + # Keep ontology/model identifiers generic (avoid licensed/sensitive names). + ontologies = ( + "ONTOLOGY_A", + "ONTOLOGY_B", + "ONTOLOGY_C", + "ONTOLOGY_D", + ) + service_models = ("demo_model_a", "demo_model_b", "demo_model_c") + service_versions = ("1.10.2", "1.11.0", "2.0.0") + + for n in notes: + # Seeded random 1–20 annotations per note. 
+ per_note = bounded_int(rng, 1, 20) + text_len = len(n.text) + for _ in range(per_note): + # Pick a safe span inside the text. + if text_len < 10: + start = 0 + end = text_len + else: + start = bounded_int(rng, 0, max(0, text_len - 2)) + span = bounded_int(rng, 1, min(25, max(1, text_len - start))) + end = min(text_len, start + span) + if end <= start: + end = min(text_len, start + 1) + + detected = choose(rng, detected_terms) + pretty = choose(rng, pretty_names) + cui = str(bounded_int(rng, 100_000_000, 999_999_999)) + + doc: Dict[str, object] = { + "nlp.cui": cui, + "enrich_top_level_concept": bounded_int(rng, 100_000, 999_999_999), + "nlp.pretty_name": pretty, + "nlp.end": end, + "nlp.types": [""], + "nlp.detected_name": detected, + "nlp.meta_anns": { + "Presence": { + "confidence": rng.random(), + "name": "Presence", + "value": choose(rng, ("Present", "Absent", "Hypothetical")), + }, + "Time": { + "confidence": rng.random(), + "name": "Time", + "value": choose(rng, ("Recent", "Historical", "Planned")), + }, + "Subject": { + "confidence": rng.random(), + "name": "Subject", + "value": choose(rng, ("Patient", "Family", "Clinician")), + }, + }, + "service_version": choose(rng, service_versions), + "nlp.start": start, + "nlp.source_value": detected, + "nlp.id": bounded_int(rng, 1, 10_000), + "meta.note_id": n.note_id, + "service_model": choose(rng, service_models), + "meta.subject_id": str(n.subject_id), + "nlp.icd10": [], + "nlp.snomed": [], + "nlp.acc": rng.random(), + "nlp.type_ids": [str(bounded_int(rng, 1_000_000, 99_999_999))], + "nlp.context_similarity": rng.random(), + "nlp.ontologies": [choose(rng, ontologies)], + "timestamp": _rand_timestamp_utc_iso(rng, n.storetime), + } + ann.append(doc) + + return ann + + def make_patients(rng: random.Random, n: int) -> List[Patient]: patients: List[Patient] = [] for subject_id in range(n): @@ -637,6 +838,8 @@ def iter_bulk_rows( icustays: Sequence[IcuStay], patients: Sequence[Patient], poe: Sequence[PoeOrder], + 
discharge: Sequence[DischargeNote], + discharge_annotations: Sequence[Dict[str, object]], ) -> Iterator[Row]: # Deterministic order by index, with _id 1..N per index. for i, a in enumerate(admissions, start=1): @@ -651,6 +854,10 @@ def iter_bulk_rows( yield ("patients", str(i), patient_doc(p)) for i, o in enumerate(poe, start=1): yield ("poe", str(i), poe_doc(o)) + for i, n in enumerate(discharge, start=1): + yield ("discharge", str(i), discharge_doc(n)) + for i, d in enumerate(discharge_annotations, start=1): + yield ("discharge_annotations", str(i), d) def write_bulk_ndjson(path: Path, rows: Iterable[Row]) -> None: @@ -662,10 +869,10 @@ def write_bulk_ndjson(path: Path, rows: Iterable[Row]) -> None: f.write(json.dumps(doc, ensure_ascii=False) + "\n") -def validate_bulk_ndjson(path: Path, expected_n: int) -> None: +def validate_bulk_ndjson(path: Path, expected_counts: Dict[str, int]) -> None: # Lightweight structural validation: alternating meta/doc, correct index names, correct counts. 
- expected_lines = 2 * (len(INDEX_ORDER) * expected_n) - index_counts: Dict[str, int] = {idx: 0 for idx in INDEX_ORDER} + expected_lines = 2 * sum(expected_counts.values()) + index_counts: Dict[str, int] = {idx: 0 for idx in expected_counts.keys()} with path.open("r", encoding="utf-8") as f: lines = f.readlines() @@ -685,19 +892,33 @@ def validate_bulk_ndjson(path: Path, expected_n: int) -> None: raise SystemExit(f"Validation failed: doc is not an object at line {i+2}") index_counts[idx] += 1 - for idx, count in index_counts.items(): - if count != expected_n: - raise SystemExit(f"Validation failed: index '{idx}' expected {expected_n} docs, got {count}") - - -def build_dataset(rng: random.Random, n: int) -> Tuple[List[Patient], List[Admission], List[IcuStay], List[PoeOrder], List[EmarEvent], List[dict]]: + for idx, expected in expected_counts.items(): + got = index_counts.get(idx, 0) + if got != expected: + raise SystemExit(f"Validation failed: index '{idx}' expected {expected} docs, got {got}") + + +def build_dataset( + rng: random.Random, n: int +) -> Tuple[ + List[Patient], + List[Admission], + List[IcuStay], + List[PoeOrder], + List[EmarEvent], + List[dict], + List[DischargeNote], + List[Dict[str, object]], +]: patients = make_patients(rng, n) admissions = make_admissions(rng, patients) icustays = make_icustays(rng, admissions) poe_orders = make_poe_orders(rng, admissions) emar_events = make_emar_events(rng, admissions, poe_orders) drg_docs = make_drgcodes(rng, admissions) - return patients, admissions, icustays, poe_orders, emar_events, drg_docs + discharge_notes = make_discharge_notes(rng, admissions) + discharge_anns = make_discharge_annotations(rng, discharge_notes) + return patients, admissions, icustays, poe_orders, emar_events, drg_docs, discharge_notes, discharge_anns def parse_args(argv: Sequence[str]) -> argparse.Namespace: @@ -721,7 +942,7 @@ def main(argv: Sequence[str]) -> int: raise SystemExit("--n must be > 0") rng = random.Random(args.seed) - 
patients, admissions, icustays, poe_orders, emar_events, drg_docs = build_dataset(rng, args.n) + patients, admissions, icustays, poe_orders, emar_events, drg_docs, discharge_notes, discharge_anns = build_dataset(rng, args.n) rows = iter_bulk_rows( admissions=admissions, @@ -730,11 +951,23 @@ def main(argv: Sequence[str]) -> int: icustays=icustays, patients=patients, poe=poe_orders, + discharge=discharge_notes, + discharge_annotations=discharge_anns, ) write_bulk_ndjson(args.out, rows) if args.validate: - validate_bulk_ndjson(args.out, args.n) + expected_counts: Dict[str, int] = { + "admissions": args.n, + "drgcodes": args.n, + "emar": args.n, + "icustays": args.n, + "patients": args.n, + "poe": args.n, + "discharge": len(discharge_notes), + "discharge_annotations": len(discharge_anns), + } + validate_bulk_ndjson(args.out, expected_counts) print(f"Completed synthetic data genration. File written to {args.out}") return 0