CogStack · phoevos · Aug 29, 2025 · Aug 28, 2025 · Aug 29, 2025
diff --git a/.github/workflows/api-docs.yaml b/.github/workflows/api-docs.yaml
@@ -33,6 +33,7 @@ jobs:
         run: |
           python app/cli/cli.py export-model-apis --model-type medcat_snomed --add-training-apis --no-exclude-unsupervised-training --no-exclude-metacat-training --add-evaluation-apis --add-previews-apis
           python app/cli/cli.py export-model-apis --model-type medcat_icd10 --add-training-apis --no-exclude-unsupervised-training --no-exclude-metacat-training --add-evaluation-apis --add-previews-apis
+          python app/cli/cli.py export-model-apis --model-type medcat_opcs4 --add-training-apis --no-exclude-unsupervised-training --no-exclude-metacat-training --add-evaluation-apis --add-previews-apis
           python app/cli/cli.py export-model-apis --model-type medcat_umls --add-training-apis --no-exclude-unsupervised-training --no-exclude-metacat-training --add-evaluation-apis --add-previews-apis
           python app/cli/cli.py export-model-apis --model-type anoncat --add-training-apis --add-evaluation-apis --add-previews-apis --exclude-metacat-training --exclude-unsupervised-training
           python app/cli/cli.py export-model-apis --model-type transformers_deid --add-training-apis --add-evaluation-apis --add-previews-apis --exclude-metacat-training --exclude-unsupervised-training
@@ -43,6 +44,7 @@ jobs:
           git checkout gh-pages
           mv ./medcat_snomed_model_apis.json ./docs/medcat_snomed_model_apis.json
           mv ./medcat_icd10_model_apis.json ./docs/medcat_icd10_model_apis.json
+          mv ./medcat_opcs4_model_apis.json ./docs/medcat_opcs4_model_apis.json
           mv ./medcat_umls_model_apis.json ./docs/medcat_umls_model_apis.json
           mv ./anoncat_model_apis.json ./docs/anoncat_model_apis.json
           mv ./transformers_deid_model_apis.json ./docs/transformers_deid_model_apis.json
@@ -51,7 +53,7 @@ jobs:
           mv ./cogstack_model_serve_apis.json ./docs/cogstack_model_serve_apis.json
           git config --global user.name "cogstack-model-serve"
           git config --global user.email "cogstack-model-serve@users.noreply.github.com"
-          git add ./docs/medcat_snomed_model_apis.json ./docs/medcat_icd10_model_apis.json ./docs/medcat_umls_model_apis.json ./docs/anoncat_model_apis.json ./docs/transformers_deid_model_apis.json ./docs/huggingface_ner_model_apis.json ./docs/huggingface_llm_model_apis.json ./docs/cogstack_model_serve_apis.json
+          git add ./docs/medcat_snomed_model_apis.json ./docs/medcat_icd10_model_apis.json ./docs/medcat_opcs4_model_apis.json ./docs/medcat_umls_model_apis.json ./docs/anoncat_model_apis.json ./docs/transformers_deid_model_apis.json ./docs/huggingface_ner_model_apis.json ./docs/huggingface_llm_model_apis.json ./docs/cogstack_model_serve_apis.json
           if [[ `git status --porcelain --untracked-files=no` ]]; then
             git commit -m "update api docs"
           else

diff --git a/README.md b/README.md
@@ -18,6 +18,7 @@ Currently, CMS offers both HTTP endpoints for running NLP-related jobs and a com
 [OpenAPI Docs](https://cogstack.github.io/CogStack-ModelServe/):
 - [SNOMED MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_snomed_model_apis.html)
 - [ICD-10 MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_icd10_model_apis.html)
+- [OPCS-4 MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_opcs4_model_apis.html)
 - [UMLS MedCAT Model](https://cogstack.github.io/CogStack-ModelServe/docs/medcat_umls_model_apis.html)
 - [De-ID MedCAT Model (AnonCAT)](https://cogstack.github.io/CogStack-ModelServe/docs/anoncat_model_apis.html)
 - [HuggingFace NER Model](https://cogstack.github.io/CogStack-ModelServe/docs/huggingface_ner_model_apis.html)
@@ -59,6 +60,7 @@ The following table summarises the servable model types with their respective ou
 |:---------------------:|:---------------:|:---------------------------------:|
 |     medcat_snomed     |  medcat-snomed  |   labelled with SNOMED concepts   |
 |     medcat_icd10      |  medcat-icd10   |   labelled with ICD-10 concepts   |
+|     medcat_opcs4      |  medcat-opcs4   |   labelled with OPCS-4 concepts   |
 |      medcat_umls      |   medcat-umls   |    labelled with UMLS concepts    |
 | medcat_deid (anoncat) |   medcat-deid   | labelled with latest PII concepts |
 |    huggingface_ner    | huggingface_ner |      customer managed labels      |

diff --git a/app/api/routers/unsupervised_training.py b/app/api/routers/unsupervised_training.py
@@ -162,7 +162,7 @@ async def train_unsupervised_with_hf_dataset(
     if hf_dataset_repo_id is None and hf_dataset_package is None:
         raise ClientException("Either 'hf_dataset_repo_id' or 'hf_dataset_package' must be provided")
 
-    if model_service.info().model_type not in [ModelType.HUGGINGFACE_NER, ModelType.MEDCAT_SNOMED, ModelType.MEDCAT_ICD10, ModelType.MEDCAT_UMLS]:
+    if model_service.info().model_type not in [ModelType.HUGGINGFACE_NER, ModelType.MEDCAT_SNOMED, ModelType.MEDCAT_ICD10, ModelType.MEDCAT_OPCS4, ModelType.MEDCAT_UMLS]:
         raise ConfigurationException(f"Currently this endpoint is not available for models of type: {model_service.info().model_type.value}")
 
     data_dir = tempfile.TemporaryDirectory()

diff --git a/app/cli/README.md b/app/cli/README.md
@@ -37,7 +37,7 @@ $ cms serve [OPTIONS]
 
 **Options**:
 
-* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to serve  [required]
+* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_opcs4|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to serve  [required]
 * `--model-path TEXT`: The file path to the model package
 * `--mlflow-model-uri models:/MODEL_NAME/ENV`: The URI of the MLflow model to serve
 * `--host TEXT`: The hostname of the server  [default: 127.0.0.1]
@@ -60,7 +60,7 @@ $ cms train [OPTIONS]
 
 **Options**:
 
-* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to train  [required]
+* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_opcs4|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to train  [required]
 * `--base-model-path TEXT`: The file path to the base model package to be trained on
 * `--mlflow-model-uri models:/MODEL_NAME/ENV`: The URI of the MLflow model to train
 * `--training-type [supervised|unsupervised|meta_supervised]`: The type of training  [required]
@@ -86,7 +86,7 @@ $ cms register [OPTIONS]
 
 **Options**:
 
-* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to register  [required]
+* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_opcs4|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to register  [required]
 * `--model-path TEXT`: The file path to the model package  [required]
 * `--model-name TEXT`: The string representation of the registered model  [required]
 * `--training-type [supervised|unsupervised|meta_supervised]`: The type of training the model went through
@@ -108,7 +108,7 @@ $ cms export-model-apis [OPTIONS]
 
 **Options**:
 
-* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to serve  [required]
+* `--model-type [medcat_snomed|medcat_umls|medcat_icd10|medcat_opcs4|medcat_deid|anoncat|transformers_deid|huggingface_ner]`: The type of the model to serve  [required]
 * `--add-training-apis / --no-add-training-apis`: Add training APIs to the doc  [default: no-add-training-apis]
 * `--add-evaluation-apis / --no-add-evaluation-apis`: Add evaluation APIs to the doc  [default: no-add-evaluation-apis]
 * `--add-previews-apis / --no-add-previews-apis`: Add preview APIs to the doc  [default: no-add-previews-apis]

diff --git a/app/cli/cli.py b/app/cli/cli.py
@@ -65,7 +65,7 @@ def serve_model(
     port: str = typer.Option("8000", help="The port of the server"),
     model_name: Optional[str] = typer.Option(None, help="The string representation of the model name"),
     streamable: bool = typer.Option(False, help="Serve the streamable endpoints only"),
-    device: Device = typer.Option(Device.DEFAULT, help="The device to serve the model on"),
+    device: Device = typer.Option(Device.DEFAULT.value, help="The device to serve the model on"),
     llm_engine: Optional[LlmEngine] = typer.Option(LlmEngine.CMS.value, help="The engine to use for text generation"),
     debug: Optional[bool] = typer.Option(None, help="Run in the debug mode"),
 ) -> None:
@@ -90,7 +90,7 @@ def serve_model(
     model_name = model_name or "CMS model"
     logger = _get_logger(debug, model_type, model_name)
     config = get_settings()
-    config.DEVICE = device.value
+    config.DEVICE = device
     if model_type in [
         ModelType.HUGGINGFACE_NER,
         ModelType.MEDCAT_DEID,
@@ -186,7 +186,7 @@ def train_model(
     hyperparameters: str = typer.Option("{}", help="The overriding hyperparameters serialised as JSON string"),
     description: Optional[str] = typer.Option(None, help="The description of the training or change logs"),
     model_name: Optional[str] = typer.Option(None, help="The string representation of the model name"),
-    device: Device = typer.Option(Device.DEFAULT, help="The device to train the model on"),
+    device: Device = typer.Option(Device.DEFAULT.value, help="The device to train the model on"),
     debug: Optional[bool] = typer.Option(None, help="Run in the debug mode"),
 ) -> None:
     """
@@ -212,7 +212,7 @@ def train_model(
     logger = _get_logger(debug, model_type, model_name)
 
     config = get_settings()
-    config.DEVICE = device.value
+    config.DEVICE = device
 
     model_service_dep = ModelServiceDep(model_type, config)
     cms_globals.model_service_dep = model_service_dep

diff --git a/app/domain.py b/app/domain.py
@@ -10,6 +10,7 @@ class ModelType(str, Enum):
     MEDCAT_SNOMED = "medcat_snomed"
     MEDCAT_UMLS = "medcat_umls"
     MEDCAT_ICD10 = "medcat_icd10"
+    MEDCAT_OPCS4 = "medcat_opcs4"
     MEDCAT_DEID = "medcat_deid"
     ANONCAT = "anoncat"
     TRANSFORMERS_DEID = "transformers_deid"

diff --git a/app/model_services/medcat_model.py b/app/model_services/medcat_model.py
@@ -165,7 +165,7 @@ def annotate(self, text: str) -> List[Annotation]:
 
         doc = self.model.get_entities(
             text,
-            addl_info=["cui2icd10", "cui2ontologies", "cui2snomed", "cui2athena_ids"],
+            addl_info=["cui2icd10", "cui2opcs4", "cui2ontologies", "cui2snomed", "cui2athena_ids"],
         )
         return [load_pydantic_object_from_dict(Annotation, record) for record in self.get_records_from_doc(doc)]
 
@@ -186,7 +186,7 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
             self._data_iterator(texts),
             batch_size_chars=batch_size_chars,
             nproc=max(int(cpu_count() / 2), 1),
-            addl_info=["cui2icd10", "cui2ontologies", "cui2snomed", "cui2athena_ids"],
+            addl_info=["cui2icd10", "cui2opcs4", "cui2ontologies", "cui2snomed", "cui2athena_ids"],
         )
         docs = dict(sorted(docs.items(), key=lambda x: x[0]))
         annotations_list = []

diff --git a/app/model_services/medcat_model_opcs4.py b/app/model_services/medcat_model_opcs4.py
@@ -0,0 +1,121 @@
+import logging
+import pandas as pd
+from typing import Dict, Optional, final, List
+
+from app import __version__ as app_version
+from app.model_services.medcat_model import MedCATModel
+from app.config import Settings
+from app.domain import ModelCard, ModelType
+
+logger = logging.getLogger("cms")
+
+
+@final
+class MedCATModelOpcs4(MedCATModel):
+    """A model service for MedCAT OPCS-4 models."""
+
+    OPCS4_KEY = "opcs4"  # entity field holding the mapped OPCS-4 code(s)
+
+    def __init__(
+        self,
+        config: Settings,
+        model_parent_dir: Optional[str] = None,
+        enable_trainer: Optional[bool] = None,
+        model_name: Optional[str] = None,
+        base_model_file: Optional[str] = None,
+    ) -> None:
+        """
+        Initialises the MedCAT OPCS-4 model service with specified configurations.
+
+        Args:
+            config (Settings): The configuration for the model service.
+            model_parent_dir (Optional[str]): The directory where the model package is stored. Defaults to None.
+            enable_trainer (Optional[bool]): The flag to enable or disable trainers. Defaults to None.
+            model_name (Optional[str]): The name of the model. Defaults to None.
+            base_model_file (Optional[str]): The model package file name. Defaults to None.
+        """
+        super().__init__(
+            config,
+            model_parent_dir=model_parent_dir,
+            enable_trainer=enable_trainer,
+            model_name=model_name,
+            base_model_file=base_model_file,
+        )
+        self.model_name = model_name or "OPCS-4 MedCAT model"
+
+    @property
+    def api_version(self) -> str:
+        """Getter for the API version of the model service."""
+
+        # APP version is used although each model service could have its own API versioning
+        return app_version
+
+    def info(self) -> ModelCard:
+        """
+        Retrieves information about the MedCAT OPCS-4 model.
+
+        Returns:
+            ModelCard: A card containing information about the MedCAT OPCS-4 model.
+        """
+
+        return ModelCard(
+            model_description=self.model_name,
+            model_type=ModelType.MEDCAT_OPCS4,
+            api_version=self.api_version,
+            model_card=self.model.get_model_card(as_dict=True),
+        )
+
+    def get_records_from_doc(self, doc: Dict) -> List[Dict]:
+        """
+        Extracts and formats entity records from a document dictionary.
+
+        Args:
+            doc (Dict): The document dictionary containing extracted named entities.
+
+        Returns:
+            List[Dict]: A list of formatted entity records, one per mapped OPCS-4 code.
+        """
+
+        df = pd.DataFrame(doc["entities"].values())
+
+        if df.empty:
+            df = pd.DataFrame(columns=["label_name", "label_id", "start", "end", "accuracy"])
+        else:
+            new_rows = []
+            for _, row in df.iterrows():
+                if self.OPCS4_KEY not in row or not row[self.OPCS4_KEY]:
+                    logger.debug("No mapped OPCS-4 code associated with the entity: %s", row)
+                else:
+                    for opcs4 in row[self.OPCS4_KEY]:
+                        output_row = row.copy()
+                        if isinstance(opcs4, str):
+                            output_row[self.OPCS4_KEY] = opcs4
+                        elif isinstance(opcs4, dict):
+                            output_row[self.OPCS4_KEY] = opcs4.get("code")
+                            output_row["pretty_name"] = opcs4.get("name")
+                        elif isinstance(opcs4, list) and opcs4:
+                            output_row[self.OPCS4_KEY] = opcs4[-1]
+                        else:
+                            logger.error("Unknown format for the OPCS-4 code(s): %s", opcs4)
+                        if "athena_ids" in output_row and output_row["athena_ids"]:
+                            output_row["athena_ids"] = [
+                                athena_id["code"] for athena_id in output_row["athena_ids"]
+                            ]
+                        # Append per code: an entity mapped to several codes yields several records.
+                        new_rows.append(output_row)
+            if new_rows:
+                df = pd.DataFrame(new_rows)
+                df.rename(
+                    columns={
+                        "pretty_name": "label_name",
+                        self.OPCS4_KEY: "label_id",
+                        "types": "categories",
+                        "acc": "accuracy",
+                        "athena_ids": "athena_ids",
+                    },
+                    inplace=True,
+                )
+                df = self._retrieve_meta_annotations(df)
+            else:
+                df = pd.DataFrame(columns=["label_name", "label_id", "start", "end", "accuracy"])
+        return df.to_dict("records")
diff --git a/app/registry.py b/app/registry.py
@@ -3,6 +3,7 @@
 from app.model_services.medcat_model_snomed import MedCATModelSnomed
 from app.model_services.medcat_model_umls import MedCATModelUmls
 from app.model_services.medcat_model_icd10 import MedCATModelIcd10
+from app.model_services.medcat_model_opcs4 import MedCATModelOpcs4
 from app.model_services.medcat_model_deid import MedCATModelDeIdentification
 from app.model_services.huggingface_ner_model import HuggingFaceNerModel
 from app.model_services.huggingface_llm_model import HuggingFaceLlmModel
@@ -11,6 +12,7 @@
     ModelType.MEDCAT_SNOMED: MedCATModelSnomed,
     ModelType.MEDCAT_UMLS: MedCATModelUmls,
     ModelType.MEDCAT_ICD10: MedCATModelIcd10,
+    ModelType.MEDCAT_OPCS4: MedCATModelOpcs4,
     ModelType.MEDCAT_DEID: MedCATModelDeIdentification,
     ModelType.ANONCAT: MedCATModelDeIdentification,
     ModelType.TRANSFORMERS_DEID: TransformersModelDeIdentification,

diff --git a/app/utils.py b/app/utils.py
@@ -63,6 +63,7 @@ def get_code_base_uri(model_name: str) -> Optional[str]:
     code_base_uris = {
         CodeType.SNOMED.value: "http://snomed.info/id",
         CodeType.ICD10.value: "https://icdcodelookup.com/icd-10/codes",
+        CodeType.OPCS4.value: "https://nhsengland.kahootz.com/t_c_home/view?objectID=14270896",
         CodeType.UMLS.value: "https://uts.nlm.nih.gov/uts/umls/concept",
     }
     for code_name, base_uri in code_base_uris.items():

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
@@ -81,6 +81,44 @@ services:
       - https_proxy=$HTTPS_PROXY
       - no_proxy=localhost
 
+  medcat-opcs4:
+    extends:
+      file: ./docker-compose.yml
+      service: medcat-opcs4
+    labels:
+      - org.cogstack.model-serve.dev=true
+    build:
+        context: ./
+        dockerfile: ./docker/medcat-opcs4/Dockerfile
+        args:
+          - CMS_MODEL_NAME=OPCS-4 MedCAT model
+          - CMS_UID=${CMS_UID:-1000}
+          - CMS_GID=${CMS_GID:-1000}
+          - HTTP_PROXY=$HTTP_PROXY
+          - HTTPS_PROXY=$HTTPS_PROXY
+          - NO_PROXY=$NO_PROXY
+    image: local-cms-medcat-opcs4:do-not-push
+    environment:
+      - BASE_MODEL_FULL_PATH=$MODEL_PACKAGE_FULL_PATH
+      - AWS_ACCESS_KEY_ID=
+      - AWS_SECRET_ACCESS_KEY=
+      - MLFLOW_S3_ENDPOINT_URL=
+      - MLFLOW_TRACKING_URI=${MLFLOW_TRACKING_URI:-file:/tmp/mlruns}
+      - MLFLOW_TRACKING_USERNAME=
+      - MLFLOW_TRACKING_PASSWORD=
+      - MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=${MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING:-false}
+      - GELF_INPUT_URI=
+      - AUTH_USER_ENABLED=${AUTH_USER_ENABLED:-false}
+      - AUTH_JWT_SECRET=$AUTH_JWT_SECRET
+      - AUTH_ACCESS_TOKEN_EXPIRE_SECONDS=${AUTH_ACCESS_TOKEN_EXPIRE_SECONDS:-3600}
+      - AUTH_DATABASE_URL=${AUTH_DATABASE_URL:-sqlite+aiosqlite:///./cms-users.db}
+      - HTTP_PROXY=$HTTP_PROXY
+      - HTTPS_PROXY=$HTTPS_PROXY
+      - NO_PROXY=localhost
+      - http_proxy=$HTTP_PROXY
+      - https_proxy=$HTTPS_PROXY
+      - no_proxy=localhost
+
   medcat-deid:
     extends:
       file: ./docker-compose.yml