From 0fcc996c3285273675868c94227cfb9cab5337ac Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Tue, 30 Sep 2025 16:32:20 +0100
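Subject: [PATCH 1/8] MetaCAT: pass config to encode_category_values

Replace the `category_undersample` argument of `encode_category_values`
(data_utils.py) with the full MetaCAT `config` and update both call
sites in meta_cat.py. With the config available, the function now
checks the number of classes found in the data against
`config.model.nclasses`, adds any classes missing from a pre-defined
`category_value2id`, and only undersamples when
`config.model.phase_number` is non-zero. The missing-category error in
`train_raw` now reports `g_config['category_name']`.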
---
 v1/medcat/medcat/meta_cat.py                  |   6 +-
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 119 ++++++++++--------
 2 files changed, 67 insertions(+), 58 deletions(-)

diff --git a/v1/medcat/medcat/meta_cat.py b/v1/medcat/medcat/meta_cat.py
index c646f44d8..8b922d488 100644
--- a/v1/medcat/medcat/meta_cat.py
+++ b/v1/medcat/medcat/meta_cat.py
@@ -252,7 +252,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
                 "The category name does not exist in this json file. You've provided '{}', "
                 "while the possible options are: {}. Additionally, ensure the populate the "
                 "'alternative_category_names' attribute to accommodate for variations.".format(
-                    category_name, " | ".join(list(data.keys()))))
+                    g_config['category_name'], " | ".join(list(data.keys()))))
 
         data = data[category_name]
         if data_oversampled:
@@ -263,12 +263,12 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
         if not category_value2id:
             # Encode the category values
             full_data, data_undersampled, category_value2id = encode_category_values(data,
-                                                                                     category_undersample=self.config.model.category_undersample,alternative_class_names=g_config['alternative_class_names'])
+                                                                                     alternative_class_names=g_config['alternative_class_names'],config=self.config)
         else:
             # We already have everything, just get the data
             full_data, data_undersampled, category_value2id = encode_category_values(data,
                                                                                      existing_category_value2id=category_value2id,
-                                                                                     category_undersample=self.config.model.category_undersample,alternative_class_names=g_config['alternative_class_names'])
+                                                                                     alternative_class_names=g_config['alternative_class_names'],config=self.config)
         g_config['category_value2id'] = category_value2id
         self.config.model['nclasses'] = len(category_value2id)
 
diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 3fff06514..8b3b3faf1 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -154,7 +154,7 @@ def prepare_for_oversampled_data(data: List,
 
 
 def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None,
-                           category_undersample=None, alternative_class_names: List[List] = []) -> Tuple:
+                           alternative_class_names: List[List] = [], config=None) -> Tuple:
     """Converts the category values in the data outputted by `prepare_from_json`
     into integer values.
 
@@ -167,6 +167,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Name of class that should be used to undersample the data (for 2 phase learning)
         alternative_class_names:
             Map that stores the variations of possible class names for the given category (task)
+        config:
+            MetaCAT config
 
     Returns:
         dict:
@@ -186,44 +188,48 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
         category_value2id = {}
 
     category_values = set([x[2] for x in data])
-
-    # If categoryvalue2id is pre-defined, then making sure it is same as the labels found in the data
-    if len(category_value2id) != 0 and set(category_value2id.keys()) != category_values:
-        # if categoryvalue2id doesn't match the labels in the data, then 'alternative_class_names' has to be defined to check for variations
-        if len(alternative_class_names) == 0:
-            # Raise an exception since the labels don't match
-            raise Exception(
-                "The classes set in the config are not the same as the one found in the data. "
-                "The classes present in the config vs the ones found in the data - "
-                f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the "
-                "'alternative_class_names' attribute to accommodate for variations.")
-        updated_category_value2id = {}
-        for _class in category_value2id.keys():
-            if _class in category_values:
-                updated_category_value2id[_class] = category_value2id[_class]
-            else:
-                found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map]
-                failed_to_find = False
-                if len(found_in) != 0:
-                    class_name_matched = [label for label in found_in[0] if label in category_values]
-                    if len(class_name_matched) != 0:
-                        updated_category_value2id[class_name_matched[0]] = category_value2id[_class]
-                        logger.info("Class name '%s' does not exist in the data; however a variation of it "
-                                    "'%s' is present; updating it...", _class, class_name_matched[0])
+    if len(category_values)!=config.model.nclasses:
+        raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses)
+
+    # If categoryvalue2id is pre-defined or if all the classes aren't mentioned
+    if len(category_value2id) != 0:
+        # making sure it is same as the labels found in the data
+        if set(category_value2id.keys()) != category_values:
+            # if categoryvalue2id doesn't match the labels in the data, then 'alternative_class_names' has to be defined to check for variations
+            if len(alternative_class_names) == 0:
+                # Raise an exception since the labels don't match
+                raise Exception(
+                    "The classes set in the config are not the same as the one found in the data. "
+                    "The classes present in the config vs the ones found in the data - "
+                    f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the "
+                    "'alternative_class_names' attribute to accommodate for variations.")
+            updated_category_value2id = {}
+            for _class in category_value2id.keys():
+                if _class in category_values:
+                    updated_category_value2id[_class] = category_value2id[_class]
+                else:
+                    found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map]
+                    failed_to_find = False
+                    if len(found_in) != 0:
+                        class_name_matched = [label for label in found_in[0] if label in category_values]
+                        if len(class_name_matched) != 0:
+                            updated_category_value2id[class_name_matched[0]] = category_value2id[_class]
+                            logger.info("Class name '%s' does not exist in the data; however a variation of it "
+                                        "'%s' is present; updating it...", _class, class_name_matched[0])
+                        else:
+                            failed_to_find = True
                     else:
                         failed_to_find = True
-                else:
-                    failed_to_find = True
-                if failed_to_find:
-                    raise Exception("The classes set in the config are not the same as the one found in the data. "
-                                    "The classes present in the config vs the ones found in the data - "
-                                    f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the "
-                                    "populate the 'alternative_class_names' attribute to accommodate for variations.")
-        category_value2id = copy.deepcopy(updated_category_value2id)
-        logger.info("Updated categoryvalue2id mapping - %s", category_value2id)
+                    if failed_to_find:
+                        raise Exception("The classes set in the config are not the same as the one found in the data. "
+                                        "The classes present in the config vs the ones found in the data - "
+                                        f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the "
+                                        "populate the 'alternative_class_names' attribute to accommodate for variations.")
+            category_value2id = copy.deepcopy(updated_category_value2id)
+            logger.info("Updated categoryvalue2id mapping - %s", category_value2id)
 
     # Else create the mapping from the labels found in the data
-    else:
+    if len(category_value2id) != len(category_values):
         for c in category_values:
             if c not in category_value2id:
                 category_value2id[c] = len(category_value2id)
@@ -239,30 +245,33 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
         if data[i][2] in category_value2id.values():
             label_data_[data[i][2]] = label_data_[data[i][2]] + 1
 
-    logger.info("Original number of samples per label: %s",label_data_)
-    # Undersampling data
-    if category_undersample is None or category_undersample == '':
-        min_label = min(label_data_.values())
+    logger.info("Original number of samples per label: %s", label_data_)
+
+    data_undersampled = []
+    if config and config.model.phase_number != 0:
+        # Undersampling data
+        category_undersample = config.model.category_undersample
+        if category_undersample is None or category_undersample == '':
+            min_label = min(label_data_.values())
 
-    else:
-        if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys():
-            min_label = label_data_[category_value2id[category_undersample]]
         else:
-            min_label = label_data_[category_undersample]
+            if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys():
+                min_label = label_data_[category_value2id[category_undersample]]
+            else:
+                min_label = label_data_[category_undersample]
 
-    data_undersampled = []
-    label_data_counter = {v: 0 for v in category_value2id.values()}
+        label_data_counter = {v: 0 for v in category_value2id.values()}
 
-    for sample in data:
-        if label_data_counter[sample[-1]] < min_label:
-            data_undersampled.append(sample)
-            label_data_counter[sample[-1]] += 1
-
-    label_data = {v: 0 for v in category_value2id.values()}
-    for i in range(len(data_undersampled)):
-        if data_undersampled[i][2] in category_value2id.values():
-            label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
-    logger.info("Updated number of samples per label (for 2-phase learning): %s",label_data)
+        for sample in data:
+            if label_data_counter[sample[-1]] < min_label:
+                data_undersampled.append(sample)
+                label_data_counter[sample[-1]] += 1
+
+        label_data = {v: 0 for v in category_value2id.values()}
+        for i in range(len(data_undersampled)):
+            if data_undersampled[i][2] in category_value2id.values():
+                label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
+        logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data)
 
     return data, data_undersampled, category_value2id
 

From 006f190303a1979f635f96dcd36837a27626d8c2 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Tue, 30 Sep 2025 16:50:23 +0100
Subject: [PATCH 2/8] Remove stale category_undersample entry from docstring

---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 8b3b3faf1..91684323c 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -163,8 +163,6 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Output of `prepare_from_json`.
         existing_category_value2id(Optional[Dict]):
             Map from category_value to id (old/existing).
-        category_undersample:
-            Name of class that should be used to undersample the data (for 2 phase learning)
         alternative_class_names:
             Map that stores the variations of possible class names for the given category (task)
         config:

From dc1a2bf5da7902e5a66699919bbf2918b9c19185 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Tue, 30 Sep 2025 17:27:39 +0100
Subject: [PATCH 3/8] Only validate nclasses when a config is provided

---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 91684323c..8998ce24c 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -186,8 +186,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
         category_value2id = {}
 
     category_values = set([x[2] for x in data])
-    if len(category_values)!=config.model.nclasses:
-        raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses)
+
+    if config:
+        if len(category_values)!=config.model.nclasses:
+            raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses)
 
     # If categoryvalue2id is pre-defined or if all the classes aren't mentioned
     if len(category_value2id) != 0:

From 089069e1104e54579c5fabca2209de82d0c934a9 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Wed, 1 Oct 2025 11:04:51 +0100
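Subject: [PATCH 4/8] Extract helpers from encode_category_values

Move the alternative class name lookup and the undersampling logic out
of `encode_category_values` into the new helper functions
`find_alternate_classname` and `undersample_data`.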
---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 153 ++++++++++++------
 1 file changed, 103 insertions(+), 50 deletions(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 8998ce24c..11c854b7b 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -153,6 +153,99 @@ def prepare_for_oversampled_data(data: List,
     return data_sampled
 
 
+def find_alternate_classname(category_value2id, category_values, alternative_class_names):
+    """Helper function to find and map to alternative class names for the given category.
+        Example: For Temporality category, 'Recent' is an alternative to 'Present'.
+
+       Args:
+           category_value2id (Dict):
+                The pre-defined category_value2id
+           category_values (Set):
+                Contains the classes (labels) found in the data
+           alternative_class_names (List):
+                Contains the mapping of alternative class names
+
+       Returns:
+           category_value2id (Dict):
+                Updated category_value2id with keys corresponding to alternative class names
+
+       Raises:
+           Exception:
+                If no alternatives are found for labels in category_value2id that don't match any of the labels in the data
+                If the alternatives defined for labels in category_value2id that don't match any of the labels in the data
+                """
+
+    updated_category_value2id = {}
+    for _class in category_value2id.keys():
+        if _class in category_values:
+            updated_category_value2id[_class] = category_value2id[_class]
+        else:
+            found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map]
+            failed_to_find = False
+            if len(found_in) != 0:
+                class_name_matched = [label for label in found_in[0] if label in category_values]
+                if len(class_name_matched) != 0:
+                    updated_category_value2id[class_name_matched[0]] = category_value2id[_class]
+                    logger.info("Class name '%s' does not exist in the data; however a variation of it "
+                                "'%s' is present; updating it...", _class, class_name_matched[0])
+                else:
+                    failed_to_find = True
+            else:
+                failed_to_find = True
+            if failed_to_find:
+                raise Exception("The classes set in the config are not the same as the one found in the data. "
+                                "The classes present in the config vs the ones found in the data - "
+                                f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the "
+                                "populate the 'alternative_class_names' attribute to accommodate for variations.")
+    category_value2id = copy.deepcopy(updated_category_value2id)
+    logger.info("Updated categoryvalue2id mapping - %s", category_value2id)
+    return category_value2id
+
+
+def undersample_data(data,category_value2id,label_data_,config,):
+    """Undersamples the data for 2 phase learning
+
+        Args:
+            data (Dict):
+                Output of `prepare_from_json`.
+            category_value2id(Dict):
+                Map from category_value to id.
+            label_data_:
+                Map that stores the number of samples for each label
+            config:
+                MetaCAT config
+
+        Returns:
+            dict:
+                Undersampled data (for 2 phase learning) with integers inplace of strings for category values
+    """
+
+
+    data_undersampled = []
+    category_undersample = config.model.category_undersample
+    if category_undersample is None or category_undersample == '':
+        min_label = min(label_data_.values())
+
+    else:
+        if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys():
+            min_label = label_data_[category_value2id[category_undersample]]
+        else:
+            min_label = label_data_[category_undersample]
+
+    label_data_counter = {v: 0 for v in category_value2id.values()}
+
+    for sample in data:
+        if label_data_counter[sample[-1]] < min_label:
+            data_undersampled.append(sample)
+            label_data_counter[sample[-1]] += 1
+
+    label_data = {v: 0 for v in category_value2id.values()}
+    for i in range(len(data_undersampled)):
+        if data_undersampled[i][2] in category_value2id.values():
+            label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
+    logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data)
+    return data_undersampled
+
 def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None,
                            alternative_class_names: List[List] = [], config=None) -> Tuple:
     """Converts the category values in the data outputted by `prepare_from_json`
@@ -177,7 +270,9 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Map from category value to ID for all categories in the data.
 
     Raises:
-        Exception: If categoryvalue2id is pre-defined and its labels do not match the labels found in the data
+        Exception:
+            If the number of classes in config do not match the number of classes found in the data
+            If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty
     """
     data = list(data)
     if existing_category_value2id is not None:
@@ -188,8 +283,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
     category_values = set([x[2] for x in data])
 
     if config:
-        if len(category_values)!=config.model.nclasses:
-            raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses)
+        if len(category_values) != config.model.nclasses:
+            raise Exception(
+                "The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",
+                len(category_values), config.model.nclasses)
 
     # If categoryvalue2id is pre-defined or if all the classes aren't mentioned
     if len(category_value2id) != 0:
@@ -203,30 +300,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
                     "The classes present in the config vs the ones found in the data - "
                     f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the "
                     "'alternative_class_names' attribute to accommodate for variations.")
-            updated_category_value2id = {}
-            for _class in category_value2id.keys():
-                if _class in category_values:
-                    updated_category_value2id[_class] = category_value2id[_class]
-                else:
-                    found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map]
-                    failed_to_find = False
-                    if len(found_in) != 0:
-                        class_name_matched = [label for label in found_in[0] if label in category_values]
-                        if len(class_name_matched) != 0:
-                            updated_category_value2id[class_name_matched[0]] = category_value2id[_class]
-                            logger.info("Class name '%s' does not exist in the data; however a variation of it "
-                                        "'%s' is present; updating it...", _class, class_name_matched[0])
-                        else:
-                            failed_to_find = True
-                    else:
-                        failed_to_find = True
-                    if failed_to_find:
-                        raise Exception("The classes set in the config are not the same as the one found in the data. "
-                                        "The classes present in the config vs the ones found in the data - "
-                                        f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the "
-                                        "populate the 'alternative_class_names' attribute to accommodate for variations.")
-            category_value2id = copy.deepcopy(updated_category_value2id)
-            logger.info("Updated categoryvalue2id mapping - %s", category_value2id)
+
+            category_value2id = alternative_class_names(category_value2id, category_values, alternative_class_names)
 
     # Else create the mapping from the labels found in the data
     if len(category_value2id) != len(category_values):
@@ -249,29 +324,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
 
     data_undersampled = []
     if config and config.model.phase_number != 0:
-        # Undersampling data
-        category_undersample = config.model.category_undersample
-        if category_undersample is None or category_undersample == '':
-            min_label = min(label_data_.values())
-
-        else:
-            if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys():
-                min_label = label_data_[category_value2id[category_undersample]]
-            else:
-                min_label = label_data_[category_undersample]
-
-        label_data_counter = {v: 0 for v in category_value2id.values()}
-
-        for sample in data:
-            if label_data_counter[sample[-1]] < min_label:
-                data_undersampled.append(sample)
-                label_data_counter[sample[-1]] += 1
-
-        label_data = {v: 0 for v in category_value2id.values()}
-        for i in range(len(data_undersampled)):
-            if data_undersampled[i][2] in category_value2id.values():
-                label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
-        logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data)
+        data_undersampled = undersample_data(data, category_value2id, label_data_, config)
 
     return data, data_undersampled, category_value2id
 

From 25153c03ae7845d924a5358bfa7ef80054eb2056 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Wed, 1 Oct 2025 11:10:34 +0100
Subject: [PATCH 5/8] Fix call to find_alternate_classname in encode_category_values

---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 11c854b7b..7bad4bfe3 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -246,6 +246,7 @@ def undersample_data(data,category_value2id,label_data_,config,):
     logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data)
     return data_undersampled
 
+
 def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None,
                            alternative_class_names: List[List] = [], config=None) -> Tuple:
     """Converts the category values in the data outputted by `prepare_from_json`
@@ -301,7 +302,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
                     f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the "
                     "'alternative_class_names' attribute to accommodate for variations.")
 
-            category_value2id = alternative_class_names(category_value2id, category_values, alternative_class_names)
+            category_value2id = find_alternate_classname(category_value2id, category_values, alternative_class_names)
 
     # Else create the mapping from the labels found in the data
     if len(category_value2id) != len(category_values):

From 5828cb25db806b58510e31773ec98198b8487358 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Wed, 1 Oct 2025 11:26:59 +0100
Subject: [PATCH 6/8] Tidy docstrings and blank lines in data_utils.py

Changes for flake8
---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index 7bad4bfe3..e28bc228f 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -170,10 +170,9 @@ def find_alternate_classname(category_value2id, category_values, alternative_cla
                 Updated category_value2id with keys corresponding to alternative class names
 
        Raises:
-           Exception:
-                If no alternatives are found for labels in category_value2id that don't match any of the labels in the data
-                If the alternatives defined for labels in category_value2id that don't match any of the labels in the data
-                """
+           Exception: If no alternatives are found for labels in category_value2id that don't match any of the labels in the data
+           Exception: If the alternatives defined for labels in category_value2id that don't match any of the labels in the data
+    """
 
     updated_category_value2id = {}
     for _class in category_value2id.keys():
@@ -220,7 +219,6 @@ def undersample_data(data,category_value2id,label_data_,config,):
                 Undersampled data (for 2 phase learning) with integers inplace of strings for category values
     """
 
-
     data_undersampled = []
     category_undersample = config.model.category_undersample
     if category_undersample is None or category_undersample == '':
@@ -271,10 +269,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Map from category value to ID for all categories in the data.
 
     Raises:
-        Exception:
-            If the number of classes in config do not match the number of classes found in the data
-            If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty
+        Exception: If the number of classes in config do not match the number of classes found in the data
+        Exception: If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty
     """
+
     data = list(data)
     if existing_category_value2id is not None:
         category_value2id = existing_category_value2id

From ba6e71ddf241ff39663adcf062030393ae67ac8b Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Wed, 1 Oct 2025 13:11:14 +0100
Subject: [PATCH 7/8] Add type hints to helpers and fix Returns docstrings

---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index e28bc228f..f8e5b66fb 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Tuple, Iterable, List, Union
+from typing import Any, Dict, Optional, Tuple, Iterable, List, Union, Set
 from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
 import copy
 import logging
@@ -153,7 +153,7 @@ def prepare_for_oversampled_data(data: List,
     return data_sampled
 
 
-def find_alternate_classname(category_value2id, category_values, alternative_class_names):
+def find_alternate_classname(category_value2id: Dict, category_values: Set, alternative_class_names: List[List]) -> Dict:
     """Helper function to find and map to alternative class names for the given category.
         Example: For Temporality category, 'Recent' is an alternative to 'Present'.
 
@@ -201,7 +201,7 @@ def find_alternate_classname(category_value2id, category_values, alternative_cla
     return category_value2id
 
 
-def undersample_data(data,category_value2id,label_data_,config,):
+def undersample_data(data: List, category_value2id: Dict, label_data_,config) -> List:
     """Undersamples the data for 2 phase learning
 
         Args:
@@ -215,8 +215,8 @@ def undersample_data(data,category_value2id,label_data_,config,):
                 MetaCAT config
 
         Returns:
-            dict:
-                Undersampled data (for 2 phase learning) with integers inplace of strings for category values
+            data_undersampled (list):
+                Return the data created for 2 phase learning) with integers inplace of strings for category values
     """
 
     data_undersampled = []
@@ -261,11 +261,11 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             MetaCAT config
 
     Returns:
-        dict:
+        data (list):
             New data with integers inplace of strings for category values.
-        dict:
+        data_undersampled (list):
             New undersampled data (for 2 phase learning) with integers inplace of strings for category values
-        dict:
+        category_value2id (dict):
             Map from category value to ID for all categories in the data.
 
     Raises:

From 5eaed0fa7155e7bb1b4b635d575109b3b38e27d2 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Wed, 1 Oct 2025 13:22:05 +0100
Subject: [PATCH 8/8] Fix documented type of data in undersample_data

---
 v1/medcat/medcat/utils/meta_cat/data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py
index f8e5b66fb..9272d1a2d 100644
--- a/v1/medcat/medcat/utils/meta_cat/data_utils.py
+++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py
@@ -205,7 +205,7 @@ def undersample_data(data: List, category_value2id: Dict, label_data_,config) ->
     """Undersamples the data for 2 phase learning
 
         Args:
-            data (Dict):
+            data (List):
                 Output of `prepare_from_json`.
             category_value2id(Dict):
                 Map from category_value to id.