Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7743236
interim changes for medcat-v2
Jun 12, 2025
29f3b76
updated TODOs
Jun 19, 2025
954608f
Update dependency to medcat v2
mart-r Jun 23, 2025
d143c9b
Update CDB/Vocab load to use the load classmethod again
mart-r Jun 23, 2025
9db1120
Move away from pkg_resources (deprecated)
mart-r Jun 25, 2025
530fbb0
Use v2 based API for loading addons (MetaCATs)
mart-r Jun 26, 2025
bc51626
Update MetaCAT loading
mart-r Jun 26, 2025
e77c14d
Update metrics to v2 format
mart-r Jun 27, 2025
570c14b
Do config parsing locally
mart-r Jun 27, 2025
13cf62a
Update to correct attribute name
mart-r Jun 27, 2025
1ca0cfa
Update solr utils to v2
mart-r Jun 30, 2025
fb20b87
Fix config access for v2
mart-r Jun 30, 2025
a26fad0
Remove addons from CDB config upon load
mart-r Jul 1, 2025
4cd8d62
Fix syntax error
mart-r Jul 1, 2025
ebe8dc2
Update Meta Annotation getting so as to avoid error if none set
mart-r Jul 1, 2025
4b40c76
Fix entity CUI / start/end char access
mart-r Jul 1, 2025
775261c
Fix some more entity detail access
mart-r Jul 1, 2025
181d668
Remove unigram table error (irrelevant / redundant)
mart-r Jul 1, 2025
bc0605d
Log more info regarding failure upon document preparation
mart-r Jul 1, 2025
3015c1e
Centralising clearing CDB addons after explicit load
mart-r Jul 2, 2025
5085bde
More specific import
mart-r Jul 2, 2025
e5b1a88
Clear CDB config addons everywhere if/when applicable
mart-r Jul 2, 2025
05ad081
Avoid circular imports by importing dynamically
mart-r Jul 2, 2025
9ea7a46
Correctly set CDB path within v2 model packs
mart-r Jul 2, 2025
6587f43
Update (very old) notebook to v2
mart-r Aug 6, 2025
d5b1861
Update (very old) notebook for v2 installation
mart-r Aug 6, 2025
c857e5c
CU-869aknppd: medcattrainer: upgrade dep
Sep 24, 2025
cc5718e
CU-869an9w8y: medcat-trainer: fix: imported exported projects needs t…
Sep 30, 2025
1eb4edb
Merge branch 'main' into upload-export-docs
tomolopolis Sep 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 58 additions & 2 deletions medcat-trainer/client/mctclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import os
from abc import ABC
from typing import List, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

import requests

Expand Down Expand Up @@ -495,6 +495,24 @@ def get_models(self) -> Tuple[List[str], List[str]]:
mct_vocabs = [MCTVocab(id=v['id'], name=v['name'], vocab_file=v['vocab_file']) for v in vocabs]
return mct_cdbs, mct_vocabs

def get_concept_dbs(self) -> List[MCTConceptDB]:
    """Fetch every concept database known to the MedCATTrainer instance.

    Sends a GET request to ``/api/concept-dbs/`` and wraps each entry of the
    response's ``results`` list in an ``MCTConceptDB``.

    Returns:
        List[MCTConceptDB]: One object per concept database returned by the server
    """
    response = requests.get(f'{self.server}/api/concept-dbs/', headers=self.headers)
    entries = json.loads(response.text)['results']
    concept_dbs = []
    for entry in entries:
        concept_dbs.append(
            MCTConceptDB(id=entry['id'], name=entry['name'], conceptdb_file=entry['cdb_file'])
        )
    return concept_dbs

def get_vocabs(self) -> List[MCTVocab]:
    """Fetch every vocabulary known to the MedCATTrainer instance.

    Sends a GET request to ``/api/vocabs/`` and wraps each entry of the
    response's ``results`` list in an ``MCTVocab``.

    Returns:
        List[MCTVocab]: One object per vocabulary returned by the server
    """
    response = requests.get(f'{self.server}/api/vocabs/', headers=self.headers)
    entries = json.loads(response.text)['results']
    vocab_objects = []
    for entry in entries:
        vocab_objects.append(
            MCTVocab(id=entry['id'], name=entry['name'], vocab_file=entry['vocab_file'])
        )
    return vocab_objects

def get_model_packs(self) -> List[MCTModelPack]:
"""Get all MedCAT model packs in the MedCATTrainer instance.

Expand Down Expand Up @@ -559,7 +577,7 @@ def get_datasets(self) -> List[MCTDataset]:
return mct_datasets

def get_project_annos(self, projects: List[MCTProject]):
"""Get the annotations for a list of projects. Schema is documented here: https://github.com/medcat/MedCATtrainer/blob/main/docs/api.md#download-annotations
"""Get the annotations for a list of projects.

Args:
projects (List[MCTProject]): A list of projects to get annotations for
Expand All @@ -574,6 +592,44 @@ def get_project_annos(self, projects: List[MCTProject]):
headers=self.headers).text)
return resp

def upload_projects_export(self, projects: Dict[str, Any],
                           cdb: Union[MCTConceptDB, str, None] = None,
                           vocab: Union[MCTVocab, str, None] = None,
                           modelpack: Union[MCTModelPack, str, None] = None):
    """Upload a Trainer export to a MedCATTrainer instance.

    The export is POSTed to ``/api/upload-deployment/`` together with the ids of
    either a concept DB + vocabulary pair or a model pack. When both a
    cdb/vocab pair and a model pack are given, the cdb/vocab pair is used.

    Args:
        projects (Dict[str, Any]): The Trainer export (e.g. a previously downloaded export loaded
            from its .json file); sent unchanged under the 'exported_projects' key
        cdb (Union[MCTConceptDB, str, None]): The concept database to be used in the project - CDB name or the MCTConceptDB object
        vocab (Union[MCTVocab, str, None]): The vocabulary to be used in the project - Vocab name or the MCTVocab object
        modelpack (Union[MCTModelPack, str, None]): The model pack to be used in the project - ModelPack name or the MCTModelPack object

    Returns:
        The decoded JSON body of the server response.

    Raises:
        MCTUtilsException: If a model given by name does not exist on the server, if neither a
            cdb/vocab pair nor a model pack is provided, or if the server answers with a
            non-2xx status code.
    """
    def _resolve(ref, fetch_all, kind):
        # Objects (and None) pass through untouched; only names need a server lookup.
        if not isinstance(ref, str):
            return ref
        matches = [item for item in fetch_all() if item.name == ref]
        if not matches:
            raise MCTUtilsException(f'No {kind} named "{ref}" found on {self.server}')
        # With duplicate names keep the last match, as the previous list.pop() did.
        return matches[-1]

    cdb = _resolve(cdb, self.get_concept_dbs, 'concept DB')
    vocab = _resolve(vocab, self.get_vocabs, 'vocabulary')
    modelpack = _resolve(modelpack, self.get_model_packs, 'model pack')

    payload = {
        'exported_projects': projects
    }

    if cdb and vocab:
        payload['cdb_id'] = cdb.id
        payload['vocab_id'] = vocab.id
    elif modelpack:
        payload['modelpack_id'] = modelpack.id
    else:
        raise MCTUtilsException('Provide either both a cdb and a vocab, or a modelpack')

    resp = requests.post(f'{self.server}/api/upload-deployment/', headers=self.headers,
                         json=payload)
    if 200 <= resp.status_code < 300:
        return resp.json()
    raise MCTUtilsException(f'Failed to upload projects export: {resp.text}')

def __str__(self) -> str:
    """Return the server, username and password separated by space-padded tabs.

    NOTE: the result contains the password in plain text, so take care when
    printing or logging this object.
    """
    return '{} \t {} \t {}'.format(self.server, self.username, self.password)

Expand Down
114 changes: 102 additions & 12 deletions medcat-trainer/notebook_docs/Client_API_Tutorials.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -90,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -304,7 +304,37 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Users:\n",
"2 : annotator1\n",
"1 : admin\n",
"\n",
"Datasets:\n",
"1 : Example Annotation Project - Model pack \t http://localhost:8001/media/cardio.csv\n",
"2 : Example Project - SNOMED CT All IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20IMPORTED_dataset.csv\n",
"3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n",
"\n",
"Concept DBs:\n",
"\n",
"Vocabularies:\n",
"\n",
"ModelPacks:\n",
"1 : snomed_2023_htn_modelpack \t http://localhost:8001/media/snomed_2023_base_model_dm_htn_copd_only_f86505ba72beff08.zipv2_48299cf9ff983030.zip\n",
"\n",
"Meta Tasks:\n",
"1 : Presence\n",
"2 : Subject\n",
"3 : Time\n",
"\n",
"Relation Tasks:\n",
"1 : Spatial\n"
]
}
],
"source": [
"# Get users\n",
"users = session.get_users()\n",
Expand Down Expand Up @@ -378,7 +408,15 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created project with model pack: 2 : Demo General Medical Annotation \t Annotation of neurology medical conditions \t 3 : Example Project - SNOMED CT All - ModelPack IMPORTED_dataset \t http://localhost:8001/media/Users/k1897038/projects/cogstack-nlp/medcat-trainer/webapp/api/media/Example%20Project%20-%20SNOMED%20CT%20All%20-%20ModelPack%20IMPORTED_dataset.csv\n"
]
}
],
"source": [
"# Method 2: Create a project with a modelpack\n",
"\n",
Expand Down Expand Up @@ -408,7 +446,17 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded annotations for 2 projects:\n",
"Example Project - SNOMED CT All - ModelPack IMPORTED\n",
"Demo General Medical Annotation\n"
]
}
],
"source": [
"# Get all projects\n",
"mct_projects = session.get_projects()\n",
Expand Down Expand Up @@ -436,15 +484,22 @@
"metadata": {},
"source": [
"## 6. Saving Annotations for Analysis\n",
"\n",
"Finally, let's save the annotations to a file for later analysis:"
"Once annotations have been collected they can be downloaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Annotations saved to ./example_data/medical_annotations.json\n"
]
}
],
"source": [
"# Save MCT export / annotations to a file\n",
"with open(\"./example_data/medical_annotations.json\", \"w\") as f:\n",
Expand All @@ -453,6 +508,41 @@
"print(\"Annotations saved to ./example_data/medical_annotations.json\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Save Annotations as a Project\n",
"Annotations can be 'imported' into a trainer instance:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"projects = json.load(open('./example_data/MedCAT_Export_With_Text_2020-05-22_10_34_09.json'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"model_pack = session.get_model_packs()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"session.upload_projects_export(projects, modelpack=model_pack[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -463,9 +553,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "bioext-medcat-env",
"display_name": "Python [conda env:cattrainer]",
"language": "python",
"name": "python3"
"name": "conda-env-cattrainer-py"
},
"language_info": {
"codemirror_mode": {
Expand Down
2 changes: 1 addition & 1 deletion medcat-trainer/webapp/api/api/admin/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class VocabularyAdmin(admin.ModelAdmin):
model = Vocabulary
list_display = ('name', 'create_time', 'last_modified', 'last_modified_by')
fields = ('name', 'vocab_file', 'create_time', 'last_modified', 'last_modified_by')

def save_model(self, request, obj, form, change):
    """Stamp the object with the requesting user, then delegate to the parent save.

    Args:
        request: The current admin request; its ``user`` becomes ``last_modified_by``
        obj: The model instance being saved
        form: Passed through unchanged to the parent implementation
        change: Passed through unchanged to the parent implementation
    """
    acting_user = request.user
    obj.last_modified_by = acting_user
    super().save_model(request, obj, form, change)
Expand Down
18 changes: 13 additions & 5 deletions medcat-trainer/webapp/api/api/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def delete_orphan_docs(dataset: Dataset):
Document.objects.filter(dataset__id=dataset.id).delete()


def upload_projects_export(medcat_export: Dict):
def upload_projects_export(medcat_export: Dict, cdb_id: str, vocab_id: str, modelpack_id: str):
for proj in medcat_export['projects']:
p = ProjectAnnotateEntities()
p.name = proj['name'] + ' IMPORTED'
Expand All @@ -79,6 +79,14 @@ def upload_projects_export(medcat_export: Dict):
else:
p.cuis = proj['cuis']

if cdb_id is not None and vocab_id is not None:
p.concept_db = ConceptDB.objects.get(id=cdb_id)
p.vocab = Vocabulary.objects.get(id=vocab_id)
elif modelpack_id is not None:
p.model_pack = ModelPack.objects.get(id=modelpack_id)
else:
raise InvalidParameterError("No cdb, vocab, or modelpack provided")

# ensure the current deployment has the necessary Entity, MetaTask and Relation objects, and warn about User objects that are not present.
ent_labels, meta_tasks, rels, unavailable_users, available_users = set(), defaultdict(set), set(), set(), dict()
for doc in proj['documents']:
Expand Down Expand Up @@ -196,13 +204,13 @@ def upload_projects_export(medcat_export: Dict):
# link relations with start and end anno ents
er.start_entity = anno_to_doc_ind[relation['start_entity_start_idx']]
er.end_entity = anno_to_doc_ind[relation['end_entity_start_idx']]
try:
if relation.get('create_time') is not None:
er.create_time = datetime.strptime(relation['create_time'], _dt_fmt)
except ValueError:
else:
er.create_time = datetime.now()
try:
if relation.get('last_modified_time') is not None:
er.last_modified = datetime.strptime(relation['last_modified_time'], _dt_fmt)
except ValueError:
else:
er.last_modified = datetime.now()
er.save()
logger.info(f"Finished annotation import for project {proj['name']}")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 5.1.7 on 2025-09-29 16:16

import django.db.models.deletion
from django.db import migrations, models


def _optional_fk(target, label):
    """Build a nullable, blank-able ForeignKey to ``target`` that is set to NULL on delete."""
    return models.ForeignKey(
        blank=True,
        default=None,
        help_text=f'The {label} to be set for this exported project',
        null=True,
        on_delete=django.db.models.deletion.SET_NULL,
        to=target,
    )


class Migration(migrations.Migration):
    """Add optional cdb_id, modelpack_id and vocab_id foreign keys to ExportedProject."""

    dependencies = [('api', '0090_merge_20250623_1330')]

    # One AddField per (field name, target model, label used in the help text).
    operations = [
        migrations.AddField(
            model_name='exportedproject',
            name=field_name,
            field=_optional_fk(target, label),
        )
        for field_name, target, label in (
            ('cdb_id', 'api.conceptdb', 'ConceptDB'),
            ('modelpack_id', 'api.modelpack', 'ModelPack'),
            ('vocab_id', 'api.vocabulary', 'Vocabulary'),
        )
    ]
5 changes: 4 additions & 1 deletion medcat-trainer/webapp/api/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,9 +524,12 @@ def __str__(self):

class ExportedProject(models.Model):
    """A previously exported Trainer .json file plus the models to attach when it is imported.

    All three model references are optional at the database level (nullable,
    blank allowed) and are set to NULL if the referenced model is deleted.
    """
    trainer_export_file = models.FileField(help_text='Previously exported MedCATtrainer .json file')
    # NOTE: despite the ``_id`` suffix these are ForeignKey fields, so reading the
    # attribute yields the related object (or None), not a raw primary key.
    cdb_id = models.ForeignKey('ConceptDB', on_delete=models.SET_NULL, blank=True, null=True, default=None,
                               help_text='The ConceptDB to be set for this exported project')
    vocab_id = models.ForeignKey('Vocabulary', on_delete=models.SET_NULL, blank=True, null=True, default=None,
                                 help_text='The Vocabulary to be set for this exported project')
    modelpack_id = models.ForeignKey('ModelPack', on_delete=models.SET_NULL, blank=True, null=True, default=None,
                                     help_text='The ModelPack to be set for this exported project')

    def __str__(self):
        """Return the export file name followed by the selected cdb, vocab and model pack."""
        return f'{self.trainer_export_file.name} - {self.cdb_id} - {self.vocab_id} - {self.modelpack_id}'


class ProjectMetrics(models.Model):
Expand Down
14 changes: 13 additions & 1 deletion medcat-trainer/webapp/api/api/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,19 @@ def remove_dataset_file(sender, instance, **kwargs):
def save_exported_projects(sender, instance, **kwargs):
    """Import the projects contained in an ExportedProject's .json file.

    Reads the export file attached to ``instance`` and passes its content,
    together with the ids of the selected concept DB, vocabulary and model
    pack (``None`` for any that is not set), to ``upload_projects_export``.

    Args:
        sender: Unused here; supplied by the signal machinery
        instance: The object providing ``trainer_export_file``, ``cdb_id``,
            ``vocab_id`` and ``modelpack_id``
        **kwargs: Unused here

    Raises:
        Exception: If the export file path does not end with '.json'
    """
    export_path = instance.trainer_export_file.path
    if not export_path.endswith('.json'):
        raise Exception("Please make sure the file is a .json file")

    # These attributes hold related objects (or None); the importer expects ids.
    cdb = instance.cdb_id
    vocab = instance.vocab_id
    modelpack = instance.modelpack_id

    # Read the export inside a context manager so the file handle is always closed.
    with open(export_path) as export_file:
        medcat_export = json.load(export_file)

    upload_projects_export(
        medcat_export,
        cdb_id=None if cdb is None else cdb.id,
        vocab_id=None if vocab is None else vocab.id,
        modelpack_id=None if modelpack is None else modelpack.id)


@receiver(pre_delete, sender=ModelPack)
Expand Down
Loading