From a5a6557a6b31dcb4ec17be6874054e03f811698a Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Wed, 1 Oct 2025 11:06:55 +0100 Subject: [PATCH 1/5] feat(medcat-trainer): improve client api, not importing empty projects --- medcat-trainer/client/mctclient.py | 6 ++++-- medcat-trainer/webapp/api/api/data_utils.py | 10 +++++++--- medcat-trainer/webapp/api/api/views.py | 10 +++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/medcat-trainer/client/mctclient.py b/medcat-trainer/client/mctclient.py index e119c835..27da892f 100644 --- a/medcat-trainer/client/mctclient.py +++ b/medcat-trainer/client/mctclient.py @@ -595,7 +595,8 @@ def get_project_annos(self, projects: List[MCTProject]): def upload_projects_export(self, projects: Dict[str, Any], cdb: Union[MCTConceptDB, str]=None, vocab: Union[MCTVocab, str]=None, - modelpack: Union[MCTModelPack, str]=None): + modelpack: Union[MCTModelPack, str]=None, + import_project_name_suffix: str=' IMPORTED'): """Upload Trainer export as a list of projects to a MedCATTrainer instance. 
Args: @@ -612,7 +613,8 @@ def upload_projects_export(self, projects: Dict[str, Any], modelpack = [m for m in self.get_model_packs() if m.name == modelpack].pop() payload = { - 'exported_projects': projects + 'exported_projects': projects, + 'project_name_suffix': import_project_name_suffix } if cdb and vocab: diff --git a/medcat-trainer/webapp/api/api/data_utils.py b/medcat-trainer/webapp/api/api/data_utils.py index 27d41140..7199bd0a 100644 --- a/medcat-trainer/webapp/api/api/data_utils.py +++ b/medcat-trainer/webapp/api/api/data_utils.py @@ -3,7 +3,7 @@ import re from collections import defaultdict from datetime import datetime -from typing import Dict +from typing import Dict, List from django.contrib.auth.models import User from django.db import transaction @@ -66,10 +66,14 @@ def delete_orphan_docs(dataset: Dataset): Document.objects.filter(dataset__id=dataset.id).delete() -def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str): +def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str, + project_name_suffix: str=' IMPORTED'): for proj in medcat_export['projects']: + if len(proj['documents']) == 0: + # don't add projects with no documents + continue p = ProjectAnnotateEntities() - p.name = proj['name'] + ' IMPORTED' + p.name = f"{proj['name']}{project_name_suffix}" if len(proj['cuis']) > 1000: # store large CUI lists in a json file. 
cuis_file_name = MEDIA_ROOT + '/' + re.sub('/|\.', '_', p.name + '_cuis_file') + '.json' diff --git a/medcat-trainer/webapp/api/api/views.py b/medcat-trainer/webapp/api/api/views.py index 1fe2cb99..49282f05 100644 --- a/medcat-trainer/webapp/api/api/views.py +++ b/medcat-trainer/webapp/api/api/views.py @@ -686,13 +686,17 @@ def upload_deployment(request): cdb_id = deployment_export.get('cdb_id', None) vocab_id = deployment_export.get('vocab_id', None) modelpack_id = deployment_export.get('modelpack_id', None) + project_name_suffix = deployment_export.get('project_name_suffix', ' IMPORTED') if all(x is None for x in [cdb_id, vocab_id, modelpack_id]): return Response("No cdb, vocab, or modelpack provided", 400) - upload_projects_export(deployment_upload, cdb_id, vocab_id, modelpack_id) - # logger.info(f'Errors encountered during previous deployment upload\n{errs}') - return Response("successfully uploaded", 200) + try: + upload_projects_export(deployment_upload, cdb_id, vocab_id, modelpack_id, + project_name_suffix) + return Response("successfully uploaded", 200) + except Exception as e: + return Response(f"Failed to upload projects export: {str(e)}", 500) @api_view(http_method_names=['GET', 'DELETE']) From d1aee8fc337e65ecc7a7cf7694e007ce3d44d26a Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Wed, 1 Oct 2025 11:28:57 +0100 Subject: [PATCH 2/5] fix test --- medcat-trainer/client/tests/test_mctclient.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/medcat-trainer/client/tests/test_mctclient.py b/medcat-trainer/client/tests/test_mctclient.py index adea343e..e3a9bc9b 100644 --- a/medcat-trainer/client/tests/test_mctclient.py +++ b/medcat-trainer/client/tests/test_mctclient.py @@ -156,6 +156,7 @@ def post_side_effect(url, *args, **kwargs): headers=session.headers, json={ 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', 'cdb_id': '20', 'vocab_id': '30' } @@ -210,6 +211,7 @@ def post_side_effect(url, *args, **kwargs): headers=session.headers, 
json={ 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', 'cdb_id': '20', 'vocab_id': '30' } @@ -249,6 +251,7 @@ def post_side_effect(url, *args, **kwargs): headers=session.headers, json={ 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', 'modelpack_id': '40' } ) @@ -297,6 +300,7 @@ def post_side_effect(url, *args, **kwargs): headers=session.headers, json={ 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', 'modelpack_id': '40' } ) From d05680bb2948883aeeef86db0c75e72b72da326f Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Tue, 7 Oct 2025 01:14:11 +0100 Subject: [PATCH 3/5] feat(medcat-trainer): improve client for synthetic data gen validation --- medcat-trainer/client/mctclient.py | 57 +++++-- .../notebook_docs/Client_API_Tutorials.ipynb | 159 +++++------------- medcat-trainer/webapp/api/api/data_utils.py | 25 ++- ...edproject_cdb_search_filter_id_and_more.py | 42 +++++ medcat-trainer/webapp/api/api/models.py | 14 +- medcat-trainer/webapp/api/api/views.py | 15 +- 6 files changed, 169 insertions(+), 143 deletions(-) create mode 100644 medcat-trainer/webapp/api/api/migrations/0092_exportedproject_cdb_search_filter_id_and_more.py diff --git a/medcat-trainer/client/mctclient.py b/medcat-trainer/client/mctclient.py index 27da892f..7a4285a5 100644 --- a/medcat-trainer/client/mctclient.py +++ b/medcat-trainer/client/mctclient.py @@ -65,8 +65,6 @@ class MCTConceptDB(MCTObj): def __post_init__(self): if self.name is not None: - if not self.name[0].islower(): - raise ValueError("Name must start with a lowercase letter") if not self.name.replace('_', '').replace('-', '').isalnum(): raise ValueError("Name must contain only alphanumeric characters and underscores") @@ -90,26 +88,24 @@ def __str__(self): @dataclass -class MCTModelPack(MCTObj): - """A model pack in the MedCATTrainer instance. +class MCTMetaTask(MCTObj): + """A meta task in the MedCATTrainer instance. Attributes: - name (str): The name of the model pack. 
- model_pack_zip (str): The path to the model pack zip file, should be a .zip file. + name (str): The name of the meta task. """ name: str=None - model_pack_zip: str=None def __str__(self): - return f'{self.id} : {self.name} \t {self. model_pack_zip}' + return f'{self.id} : {self.name}' @dataclass -class MCTMetaTask(MCTObj): - """A meta task in the MedCATTrainer instance. +class MCTRelTask(MCTObj): + """A relation extraction task in the MedCATTrainer instance. Attributes: - name (str): The name of the meta task. + name (str): The name of the relation extraction task. """ name: str=None @@ -118,16 +114,22 @@ def __str__(self): @dataclass -class MCTRelTask(MCTObj): - """A relation extraction task in the MedCATTrainer instance. +class MCTModelPack(MCTObj): + """A model pack in the MedCATTrainer instance. Attributes: - name (str): The name of the relation extraction task. + name (str): The name of the model pack. + model_pack_zip (str): The path to the model pack zip file, should be a .zip file. """ name: str=None + model_pack_zip: str=None + concept_db: MCTConceptDB=None + vocab: MCTVocab=None + meta_cats: List[MCTMetaTask]=None def __str__(self): - return f'{self.id} : {self.name}' + return f'{self.id} : {self.name} \t {self. 
model_pack_zip}' + @dataclass @@ -520,7 +522,11 @@ def get_model_packs(self) -> List[MCTModelPack]: List[MCTModelPack]: A list of all MedCAT model packs in the MedCATTrainer instance """ resp = json.loads(requests.get(f'{self.server}/api/modelpacks/', headers=self.headers).text)['results'] - mct_model_packs = [MCTModelPack(id=mp['id'], name=mp['name'], model_pack_zip=mp['model_pack']) for mp in resp] + mct_model_packs = [MCTModelPack(id=mp['id'], name=mp['name'], model_pack_zip=mp['model_pack'], + concept_db=MCTConceptDB(id=mp['concept_db']), + vocab=MCTVocab(id=mp['vocab']), + meta_cats=[MCTMetaTask(id=mt) for mt in mp['meta_cats']]) + for mp in resp] return mct_model_packs def get_meta_tasks(self) -> List[MCTMetaTask]: @@ -596,7 +602,10 @@ def upload_projects_export(self, projects: Dict[str, Any], cdb: Union[MCTConceptDB, str]=None, vocab: Union[MCTVocab, str]=None, modelpack: Union[MCTModelPack, str]=None, - import_project_name_suffix: str=' IMPORTED'): + import_project_name_suffix: str=' IMPORTED', + cdb_search_filter: Union[MCTConceptDB, str]=None, + members: Union[List[MCTUser], List[str]]=None, + set_validated_docs: bool=False): """Upload Trainer export as a list of projects to a MedCATTrainer instance. 
Args: @@ -604,6 +613,10 @@ def upload_projects_export(self, projects: Dict[str, Any], cdb (Union[MCTConceptDB, str]): The concept database to be used in the project - CDB name or the MCTCDB Object vocab (Union[MCTVocab, str]): The vocabulary to be used in the project - Vocab name or the MCTVocab Object modelpack (Union[MCTModelPack, str]): The model pack to be used in the project - ModelPack name or the MCTModelPack Object + import_project_name_suffix (str): The suffix to be added to the project name + cdb_search_filter (Union[MCTConceptDB, str]): The concept database to be used in the project - CDB name or the MCTCDB Object + members (Union[List[MCTUser], List[str]]): The annotators for the project - List of MCTUser objects or list of user names + set_validated_docs (bool): Whether to set the validated documents, e.g. their annotation submit status. """ if isinstance(cdb, str): cdb = [c for c in self.get_concept_dbs() if c.name == cdb].pop() @@ -611,10 +624,18 @@ def upload_projects_export(self, projects: Dict[str, Any], vocab = [v for v in self.get_vocabs() if v.name == vocab].pop() if isinstance(modelpack, str): modelpack = [m for m in self.get_model_packs() if m.name == modelpack].pop() + if isinstance(cdb_search_filter, str): + cdb_search_filter = [c for c in self.get_concept_dbs() if c.name == cdb_search_filter].pop() + if isinstance(members, str): + members = [m for m in self.get_users() if m.username == members].pop() payload = { 'exported_projects': projects, - 'project_name_suffix': import_project_name_suffix + 'project_name_suffix': import_project_name_suffix, + 'cdb_search_filter': cdb_search_filter.id, + 'members': [m.id for m in members], + 'import_project_name_suffix': import_project_name_suffix, + 'set_validated_docs': set_validated_docs, } if cdb and vocab: diff --git a/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb b/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb index 5220262e..c3a13172 100644 --- 
a/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb +++ b/medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -90,53 +90,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Users:\n", - "3 : annotator2\n", - "2 : annotator1\n", - "1 : admin\n", - "\n", - "Datasets:\n", - "1 : Example Dataset \t http://localhost:8001/media/Example_Dataset.csv\n", - "2 : Neurology Notes \t http://localhost:8001/media/neurology_notes.csv\n", - "3 : SG-example-docs \t http://localhost:8001/media/sg-sample-docs.csv\n", - "\n", - "Concept DBs:\n", - "1 : umls_cdb \t http://localhost:8001/media/cdb.dat\n", - "2 : snomed_cdb \t http://localhost:8001/media/snomed-cdb.dat\n", - "3 : snomed_2022_modelpack_CDB \t http://localhost:8001/media/Users/k1897038/projects/MedCATtrainer/webapp/api/media/20230227__kch_gstt_trained_model_494c3717f637bb89/cdb.dat\n", - "8 : medcat_full_pack_CDB \t http://localhost:8001/media/Users/k1897038/projects/MedCATtrainer/webapp/api/media/medcat_model_pack_u3fB9G5/cdb.dat\n", - "12 : snomed-2023-bert-metacats_CDB \t http://localhost:8001/media/Users/k1897038/projects/MedCATtrainer/webapp/api/media/20230227__kch_gstt_trained_model_bert_metacats_138689a7bb83cb0a/cdb.dat\n", - "13 : de_id_modelpack_CDB \t http://localhost:8001/media/Users/k1897038/projects/MedCATtrainer/webapp/api/media/medcat_deid_trained_a7120281ebb9fc9e/cdb.dat\n", - "\n", - "Vocabularies:\n", - "1 : http://localhost:8001/media/vocab.dat\n", - "3 : http://localhost:8001/media/20230227__kch_gstt_trained_model_494c3717f637bb89/vocab.dat\n", - "12 : 
http://localhost:8001/media/20230227__kch_gstt_trained_model_bert_metacats_138689a7bb83cb0a/vocab.dat\n", - "\n", - "ModelPacks:\n", - "1 : snomed_2022_modelpack \t http://localhost:8001/media/20230227__kch_gstt_trained_model_494c3717f637bb89.zip\n", - "9 : snomed-2023-bert-metacats \t http://localhost:8001/media/20230227__kch_gstt_trained_model_bert_metacats_138689a7bb83cb0a.zip\n", - "10 : de-id modelpack \t http://localhost:8001/media/medcat_deid_trained_a7120281ebb9fc9e.zip\n", - "\n", - "Meta Tasks:\n", - "1 : Experiencer\n", - "2 : Presence\n", - "3 : Subject\n", - "4 : Temporality\n", - "5 : Time\n", - "\n", - "Relation Tasks:\n", - "1 : Spatial\n" - ] - } - ], + "outputs": [], "source": [ "# Get users\n", "users = session.get_users()\n", @@ -304,37 +260,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Users:\n", - "2 : annotator1\n", - "1 : admin\n", - "\n", - "Datasets:\n", - "1 : Example Annotation Project - Model pack \t http://localhost:8001/media/cardio.csv\n", - "2 : Example Project - SNOMED CT All IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20IMPORTED_dataset.csv\n", - "3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n", - "\n", - "Concept DBs:\n", - "\n", - "Vocabularies:\n", - "\n", - "ModelPacks:\n", - "1 : snomed_2023_htn_modelpack \t http://localhost:8001/media/snomed_2023_base_model_dm_htn_copd_only_f86505ba72beff08.zipv2_48299cf9ff983030.zip\n", - "\n", - "Meta Tasks:\n", - "1 : Presence\n", - "2 : Subject\n", - "3 : Time\n", - "\n", - "Relation Tasks:\n", - "1 : Spatial\n" - ] - } - ], + "outputs": [], "source": [ "# Get users\n", "users 
= session.get_users()\n", @@ -408,15 +334,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created project with model pack: 2 : Demo General Medical Annotation \t Annotation of neurology medical conditions \t 3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n" - ] - } - ], + "outputs": [], "source": [ "# Method 2: Create a project with a modelpack\n", "\n", @@ -446,17 +364,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloaded annotations for 2 projects:\n", - "Example Project - SNOMED CT All - ModelPack IMPORTED\n", - "Demo General Medical Annotation\n" - ] - } - ], + "outputs": [], "source": [ "# Get all projects\n", "mct_projects = session.get_projects()\n", @@ -489,17 +397,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Annotations saved to ./example_data/medical_annotations.json\n" - ] - } - ], + "outputs": [], "source": [ "# Save MCT export / annotations to a file\n", "with open(\"./example_data/medical_annotations.json\", \"w\") as f:\n", @@ -518,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -527,20 +427,47 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "model_pack = session.get_model_packs()" + "model_packs = session.get_model_packs()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - 
"session.upload_projects_export(projects, modelpack=model_pack[0])" + "users = session.get_users()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'successfully uploaded'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.upload_projects_export(\n", + " projects,\n", + " modelpack=model_packs[1],\n", + " cdb_search_filter=model_packs[1].concept_db,\n", + " members=users,\n", + " import_project_name_suffix='imported4',\n", + " set_validated_docs=False\n", + ")" ] }, { @@ -553,9 +480,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:cattrainer]", + "display_name": "cattrainer", "language": "python", - "name": "conda-env-cattrainer-py" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/medcat-trainer/webapp/api/api/data_utils.py b/medcat-trainer/webapp/api/api/data_utils.py index 7199bd0a..9ecde91e 100644 --- a/medcat-trainer/webapp/api/api/data_utils.py +++ b/medcat-trainer/webapp/api/api/data_utils.py @@ -66,8 +66,17 @@ def delete_orphan_docs(dataset: Dataset): Document.objects.filter(dataset__id=dataset.id).delete() -def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str, - project_name_suffix: str=' IMPORTED'): +def upload_projects_export( + medcat_export: Dict, + cdb_id: str, + vocab_id: str, + modelpack_id: str, + project_name_suffix: str = ' IMPORTED', + cdb_search_filter_id: str = None, + members: List[str] = None, + import_project_name_suffix: str = ' IMPORTED', + set_validated_docs: bool = False +): for proj in medcat_export['projects']: if len(proj['documents']) == 0: # don't add projects with no documents @@ -119,6 +128,12 @@ def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, mode p.dataset = ds_mod p.save() + if cdb_search_filter_id is not None: + 
p.cdb_search_filter.set([ConceptDB.objects.get(id=cdb_search_filter_id)]) + + if members is not None: + p.members.set(members) + # create django ORM model instances that are referenced in the upload if they don't exist. for u in unavailable_users: logger.warning(f'Username: {u} - not present in this trainer deployment.') @@ -150,7 +165,11 @@ def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, mode r.label = rel r.save() - p.validated_documents.set(list(Document.objects.filter(dataset=ds_mod))) + if set_validated_docs: + p.validated_documents.set(list(Document.objects.filter(dataset=ds_mod))) + else: + p.validated_documents.clear() + for doc in proj['documents']: doc_mod = Document.objects.filter(Q(dataset=ds_mod) & Q(text=doc['text'])).first() diff --git a/medcat-trainer/webapp/api/api/migrations/0092_exportedproject_cdb_search_filter_id_and_more.py b/medcat-trainer/webapp/api/api/migrations/0092_exportedproject_cdb_search_filter_id_and_more.py new file mode 100644 index 00000000..e1b4db2b --- /dev/null +++ b/medcat-trainer/webapp/api/api/migrations/0092_exportedproject_cdb_search_filter_id_and_more.py @@ -0,0 +1,42 @@ +# Generated by Django 5.1.7 on 2025-10-07 00:03 + +import django.core.validators +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0091_exportedproject_cdb_id_exportedproject_modelpack_id_and_more'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddField( + model_name='exportedproject', + name='cdb_search_filter_id', + field=models.ForeignKey(blank=True, default=None, help_text='The CDB that will be used for concept lookup. 
This specific CDB should have been "imported" via the CDB admin screen', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='concept_source_exported_project', to='api.conceptdb'), + ), + migrations.AddField( + model_name='exportedproject', + name='import_project_name_suffix', + field=models.CharField(default=' IMPORTED', help_text='The suffix to be added to the project name', max_length=100), + ), + migrations.AddField( + model_name='exportedproject', + name='members', + field=models.ManyToManyField(blank=True, default=None, help_text='The annotators for the project', to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='exportedproject', + name='set_validated_docs', + field=models.BooleanField(default=False, help_text='Whether to set the validated documents, e.g. their annotation submit status.'), + ), + migrations.AlterField( + model_name='conceptdb', + name='name', + field=models.CharField(blank=True, default='', max_length=100, unique=True, validators=[django.core.validators.RegexValidator('^[a-zA-Z][a-zA-Z0-9_]*$', 'a-zA-Z for first character required. Alpahanumeric or _ thereafter are allowed for CDB names')]), + ), + ] diff --git a/medcat-trainer/webapp/api/api/models.py b/medcat-trainer/webapp/api/api/models.py index 00055bbe..d03aa8de 100644 --- a/medcat-trainer/webapp/api/api/models.py +++ b/medcat-trainer/webapp/api/api/models.py @@ -29,7 +29,7 @@ ] -cdb_name_validator = RegexValidator(r'^[a-zA-Z][a-zA-Z0-9_]*$', 'a-z for first character required. Alpahanumeric and _ thereafter are allowed for CDB names') +cdb_name_validator = RegexValidator(r'^[a-zA-Z][a-zA-Z0-9_]*$', 'a-zA-Z for first character required. 
Alpahanumeric or _ thereafter are allowed for CDB names') logger = logging.getLogger(__name__) @@ -110,9 +110,7 @@ def save(self, *args, **kwargs): except Exception as exc: raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc - # Only save if this is an update (not a new instance) - if not is_new: - super().save(*args, **kwargs) + super().save(*args, **kwargs) def __str__(self): return self.name @@ -524,9 +522,17 @@ def __str__(self): class ExportedProject(models.Model): trainer_export_file = models.FileField(help_text='Previously exported MedCATtrainer .json file') + import_project_name_suffix = models.CharField(max_length=100, default=' IMPORTED', help_text='The suffix to be added to the project name') + members = models.ManyToManyField(settings.AUTH_USER_MODEL, blank=True, default=None, help_text='The annotators for the project') cdb_id = models.ForeignKey('ConceptDB', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ConceptDB to be set for this exported project') vocab_id = models.ForeignKey('Vocabulary', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The Vocabulary to be set for this exported project') modelpack_id = models.ForeignKey('ModelPack', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The ModelPack to be set for this exported project') + cdb_search_filter_id = models.ForeignKey('ConceptDB', on_delete=models.SET_NULL, blank=True, null=True, default=None, help_text='The CDB that will be used for concept lookup. ' + 'This specific CDB should have been "imported" ' + 'via the CDB admin screen', related_name='concept_source_exported_project') + set_validated_docs = models.BooleanField(default=False, help_text='Whether to set the validated documents, e.g. 
their annotation submit status.') + + def __str__(self): return f'{self.trainer_export_file.name} - {self.cdb_id} - {self.vocab_id} - {self.modelpack_id}' diff --git a/medcat-trainer/webapp/api/api/views.py b/medcat-trainer/webapp/api/api/views.py index 49282f05..7a06a01c 100644 --- a/medcat-trainer/webapp/api/api/views.py +++ b/medcat-trainer/webapp/api/api/views.py @@ -687,13 +687,24 @@ def upload_deployment(request): vocab_id = deployment_export.get('vocab_id', None) modelpack_id = deployment_export.get('modelpack_id', None) project_name_suffix = deployment_export.get('project_name_suffix', ' IMPORTED') + set_validated_docs = deployment_export.get('set_validated_docs', False) + cdb_search_filter_id = deployment_export.get('cdb_search_filter', None) + members = deployment_export.get('members', None) + import_project_name_suffix = deployment_export.get('import_project_name_suffix', ' IMPORTED') if all(x is None for x in [cdb_id, vocab_id, modelpack_id]): return Response("No cdb, vocab, or modelpack provided", 400) try: - upload_projects_export(deployment_upload, cdb_id, vocab_id, modelpack_id, - project_name_suffix) + upload_projects_export(deployment_upload, + cdb_id, + vocab_id, + modelpack_id, + project_name_suffix, + cdb_search_filter_id, + members, + import_project_name_suffix, + set_validated_docs) return Response("successfully uploaded", 200) except Exception as e: return Response(f"Failed to upload projects export: {str(e)}", 500) From 7f343258d5f5b65c5f25dc939539d9de0f537340 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Tue, 7 Oct 2025 01:31:17 +0100 Subject: [PATCH 4/5] tests for client changes --- medcat-trainer/client/mctclient.py | 8 +- medcat-trainer/client/tests/test_mctclient.py | 354 +++++++++++++++++- 2 files changed, 353 insertions(+), 9 deletions(-) diff --git a/medcat-trainer/client/mctclient.py b/medcat-trainer/client/mctclient.py index 7a4285a5..59119419 100644 --- a/medcat-trainer/client/mctclient.py +++ 
b/medcat-trainer/client/mctclient.py @@ -626,14 +626,14 @@ def upload_projects_export(self, projects: Dict[str, Any], modelpack = [m for m in self.get_model_packs() if m.name == modelpack].pop() if isinstance(cdb_search_filter, str): cdb_search_filter = [c for c in self.get_concept_dbs() if c.name == cdb_search_filter].pop() - if isinstance(members, str): - members = [m for m in self.get_users() if m.username == members].pop() + if members and all(isinstance(m, str) for m in members): + members = [m for m in self.get_users() if m.username in members] payload = { 'exported_projects': projects, 'project_name_suffix': import_project_name_suffix, - 'cdb_search_filter': cdb_search_filter.id, - 'members': [m.id for m in members], + 'cdb_search_filter': cdb_search_filter.id if cdb_search_filter else None, + 'members': [m.id for m in members] if members else None, 'import_project_name_suffix': import_project_name_suffix, 'set_validated_docs': set_validated_docs, } diff --git a/medcat-trainer/client/tests/test_mctclient.py b/medcat-trainer/client/tests/test_mctclient.py index e3a9bc9b..455417ee 100644 --- a/medcat-trainer/client/tests/test_mctclient.py +++ b/medcat-trainer/client/tests/test_mctclient.py @@ -158,7 +158,11 @@ def post_side_effect(url, *args, **kwargs): 'exported_projects': projects, 'project_name_suffix': ' IMPORTED', 'cdb_id': '20', - 'vocab_id': '30' + 'vocab_id': '30', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False } ) self.assertEqual(result, mock_upload_response) @@ -213,7 +217,11 @@ def post_side_effect(url, *args, **kwargs): 'exported_projects': projects, 'project_name_suffix': ' IMPORTED', 'cdb_id': '20', - 'vocab_id': '30' + 'vocab_id': '30', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False } ) self.assertEqual(result, mock_upload_response) @@ -252,7 +260,11 @@ def post_side_effect(url, *args, 
**kwargs): json={ 'exported_projects': projects, 'project_name_suffix': ' IMPORTED', - 'modelpack_id': '40' + 'modelpack_id': '40', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False } ) self.assertEqual(result, mock_upload_response) @@ -266,7 +278,7 @@ def get_side_effect(url, *args, **kwargs): if url.endswith('/api/modelpacks/'): return MagicMock( status_code=200, - text=json.dumps({"results": [{"id": "40", "name": "testModelPack", "model_pack": "model.zip"}]}) + text=json.dumps({"results": [{"id": "40", "name": "testModelPack", "model_pack": "model.zip", "concept_db": "20", "vocab": "30", "meta_cats": ["200"]}]}) ) else: return MagicMock(status_code=404, text='') @@ -301,7 +313,11 @@ def post_side_effect(url, *args, **kwargs): json={ 'exported_projects': projects, 'project_name_suffix': ' IMPORTED', - 'modelpack_id': '40' + 'modelpack_id': '40', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False } ) self.assertEqual(result, mock_upload_response) @@ -353,5 +369,333 @@ def post_side_effect(url, *args, **kwargs): self.assertIn('Failed to upload projects export', str(context.exception)) + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_with_custom_suffix(self, mock_get, mock_post): + """Test upload_projects_export with custom import_project_name_suffix""" + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = 
MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + projects = [{"id": 1, "name": "Project 1"}] + + result = session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + import_project_name_suffix=' - CUSTOM SUFFIX' + ) + + # Verify the API call was made correctly + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' - CUSTOM SUFFIX', + 'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' - CUSTOM SUFFIX', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_with_cdb_search_filter_object(self, mock_get, mock_post): + """Test upload_projects_export with cdb_search_filter as MCTConceptDB object""" + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + cdb_search_filter = MCTConceptDB(id='25', name='searchFilterCDB', conceptdb_file='filter.dat') + projects = [{"id": 1, "name": "Project 1"}] + + result = 
session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + cdb_search_filter=cdb_search_filter + ) + + # Verify the API call was made correctly + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', + 'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': '25', + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_with_cdb_search_filter_string(self, mock_get, mock_post): + """Test upload_projects_export with cdb_search_filter as string name""" + # Mock get_concept_dbs response + def get_side_effect(url, *args, **kwargs): + if url.endswith('/api/concept-dbs/'): + return MagicMock( + status_code=200, + text=json.dumps({"results": [ + {"id": "20", "name": "testCDB", "cdb_file": "cdb.dat"}, + {"id": "25", "name": "searchFilterCDB", "cdb_file": "filter.dat"} + ]}) + ) + else: + return MagicMock(status_code=404, text='') + + mock_get.side_effect = get_side_effect + + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + projects = [{"id": 1, "name": "Project 1"}] + + 
result = session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + cdb_search_filter="searchFilterCDB" + ) + + # Verify the API call was made correctly + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', + 'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': '25', + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_with_members_objects(self, mock_get, mock_post): + """Test upload_projects_export with members as list of MCTUser objects""" + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + members = [MCTUser(id='100', username='user1'), MCTUser(id='101', username='user2')] + projects = [{"id": 1, "name": "Project 1"}] + + result = session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + members=members + ) + + # Verify the API call was made correctly + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', + 
'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': None, + 'members': ['100', '101'], + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_with_members_strings(self, mock_get, mock_post): + """Test upload_projects_export with members as list of string usernames""" + # Mock get_users response + def get_side_effect(url, *args, **kwargs): + if url.endswith('/api/users/'): + return MagicMock( + status_code=200, + text=json.dumps({"results": [ + {"id": "100", "username": "user1"}, + {"id": "101", "username": "user2"} + ]}) + ) + else: + return MagicMock(status_code=404, text='') + + mock_get.side_effect = get_side_effect + + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + projects = [{"id": 1, "name": "Project 1"}] + + result = session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + members=["user1", "user2"] + ) + + # Verify the API call was made correctly + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', + 'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': 
None, + 'members': ['100', '101'], + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + + @patch('mctclient.requests.post') + @patch('mctclient.requests.get') + def test_upload_projects_export_handles_none_parameters(self, mock_get, mock_post): + """Test upload_projects_export handles None values for optional parameters gracefully""" + # Mock authentication and upload responses + mock_upload_response = {"status": "success", "uploaded_projects": 1} + + def post_side_effect(url, *args, **kwargs): + if url.endswith('/api/api-token-auth/'): + return MagicMock(status_code=200, text='{"token": "abc"}') + elif url.endswith('/api/upload-deployment/'): + return MagicMock( + status_code=200, + json=lambda: mock_upload_response + ) + else: + return MagicMock(status_code=404, text='') + + mock_post.side_effect = post_side_effect + + session = MedCATTrainerSession(server='http://localhost', username='u', password='p') + cdb = MCTConceptDB(id='20', name='testCDB', conceptdb_file='cdb.dat') + vocab = MCTVocab(id='30', name='testVocab', vocab_file='vocab.dat') + projects = [{"id": 1, "name": "Project 1"}] + + # This test verifies that the implementation properly handles None values + result = session.upload_projects_export( + projects, + cdb=cdb, + vocab=vocab, + cdb_search_filter=None, # This should be handled gracefully + members=None # This should be handled gracefully + ) + + # Verify the API call was made correctly with None values + mock_post.assert_called_with( + f'{session.server}/api/upload-deployment/', + headers=session.headers, + json={ + 'exported_projects': projects, + 'project_name_suffix': ' IMPORTED', + 'cdb_id': '20', + 'vocab_id': '30', + 'cdb_search_filter': None, + 'members': None, + 'import_project_name_suffix': ' IMPORTED', + 'set_validated_docs': False + } + ) + self.assertEqual(result, mock_upload_response) + if __name__ == '__main__': unittest.main() \ No newline at end of 
file From 75d8420453cd0da221db88482c66f3cee734d27d Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Tue, 7 Oct 2025 10:36:40 +0100 Subject: [PATCH 5/5] fix codeQL leak --- medcat-trainer/webapp/api/api/views.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-trainer/webapp/api/api/views.py b/medcat-trainer/webapp/api/api/views.py index 7a06a01c..feeec638 100644 --- a/medcat-trainer/webapp/api/api/views.py +++ b/medcat-trainer/webapp/api/api/views.py @@ -707,7 +707,8 @@ def upload_deployment(request): set_validated_docs) return Response("successfully uploaded", 200) except Exception as e: - return Response(f"Failed to upload projects export: {str(e)}", 500) + logger.error(f"Failed to upload projects export: {str(e)}", exc_info=e) + return Response("Failed to upload projects export", 500) @api_view(http_method_names=['GET', 'DELETE'])