From 0fcc996c3285273675868c94227cfb9cab5337ac Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:32:20 +0100 Subject: [PATCH 1/8] Pushing update for metacat Includes changes to data_utils --- v1/medcat/medcat/meta_cat.py | 6 +- v1/medcat/medcat/utils/meta_cat/data_utils.py | 119 ++++++++++-------- 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/v1/medcat/medcat/meta_cat.py b/v1/medcat/medcat/meta_cat.py index c646f44d8..8b922d488 100644 --- a/v1/medcat/medcat/meta_cat.py +++ b/v1/medcat/medcat/meta_cat.py @@ -252,7 +252,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data "The category name does not exist in this json file. You've provided '{}', " "while the possible options are: {}. Additionally, ensure the populate the " "'alternative_category_names' attribute to accommodate for variations.".format( - category_name, " | ".join(list(data.keys())))) + g_config['category_name'], " | ".join(list(data.keys())))) data = data[category_name] if data_oversampled: @@ -263,12 +263,12 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data if not category_value2id: # Encode the category values full_data, data_undersampled, category_value2id = encode_category_values(data, - category_undersample=self.config.model.category_undersample,alternative_class_names=g_config['alternative_class_names']) + alternative_class_names=g_config['alternative_class_names'],config=self.config) else: # We already have everything, just get the data full_data, data_undersampled, category_value2id = encode_category_values(data, existing_category_value2id=category_value2id, - category_undersample=self.config.model.category_undersample,alternative_class_names=g_config['alternative_class_names']) + alternative_class_names=g_config['alternative_class_names'],config=self.config) g_config['category_value2id'] = category_value2id self.config.model['nclasses'] = len(category_value2id) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 3fff06514..8b3b3faf1 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -154,7 +154,7 @@ def prepare_for_oversampled_data(data: List, def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None, - category_undersample=None, alternative_class_names: List[List] = []) -> Tuple: + alternative_class_names: List[List] = [], config=None) -> Tuple: """Converts the category values in the data outputted by `prepare_from_json` into integer values. @@ -167,6 +167,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict Name of class that should be used to undersample the data (for 2 phase learning) alternative_class_names: Map that stores the variations of possible class names for the given category (task) + config: + MetaCAT config Returns: dict: @@ -186,44 +188,48 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict category_value2id = {} category_values = set([x[2] for x in data]) - - # If categoryvalue2id is pre-defined, then making sure it is same as the labels found in the data - if len(category_value2id) != 0 and set(category_value2id.keys()) != category_values: - # if categoryvalue2id doesn't match the labels in the data, then 'alternative_class_names' has to be defined to check for variations - if len(alternative_class_names) == 0: - # Raise an exception since the labels don't match - raise Exception( - "The classes set in the config are not the same as the one found in the data. " - "The classes present in the config vs the ones found in the data - " - f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the " - "'alternative_class_names' attribute to accommodate for variations.") - updated_category_value2id = {} - for _class in category_value2id.keys(): - if _class in category_values: - updated_category_value2id[_class] = category_value2id[_class] - else: - found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map] - failed_to_find = False - if len(found_in) != 0: - class_name_matched = [label for label in found_in[0] if label in category_values] - if len(class_name_matched) != 0: - updated_category_value2id[class_name_matched[0]] = category_value2id[_class] - logger.info("Class name '%s' does not exist in the data; however a variation of it " - "'%s' is present; updating it...", _class, class_name_matched[0]) + if len(category_values)!=config.model.nclasses: + raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses) + + # If categoryvalue2id is pre-defined or if all the classes aren't mentioned + if len(category_value2id) != 0: + # making sure it is same as the labels found in the data + if set(category_value2id.keys()) != category_values: + # if categoryvalue2id doesn't match the labels in the data, then 'alternative_class_names' has to be defined to check for variations + if len(alternative_class_names) == 0: + # Raise an exception since the labels don't match + raise Exception( + "The classes set in the config are not the same as the one found in the data. " + "The classes present in the config vs the ones found in the data - " + f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the " + "'alternative_class_names' attribute to accommodate for variations.") + updated_category_value2id = {} + for _class in category_value2id.keys(): + if _class in category_values: + updated_category_value2id[_class] = category_value2id[_class] + else: + found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map] + failed_to_find = False + if len(found_in) != 0: + class_name_matched = [label for label in found_in[0] if label in category_values] + if len(class_name_matched) != 0: + updated_category_value2id[class_name_matched[0]] = category_value2id[_class] + logger.info("Class name '%s' does not exist in the data; however a variation of it " + "'%s' is present; updating it...", _class, class_name_matched[0]) + else: + failed_to_find = True else: failed_to_find = True - else: - failed_to_find = True - if failed_to_find: - raise Exception("The classes set in the config are not the same as the one found in the data. " - "The classes present in the config vs the ones found in the data - " - f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the " - "populate the 'alternative_class_names' attribute to accommodate for variations.") - category_value2id = copy.deepcopy(updated_category_value2id) - logger.info("Updated categoryvalue2id mapping - %s", category_value2id) + if failed_to_find: + raise Exception("The classes set in the config are not the same as the one found in the data. " + "The classes present in the config vs the ones found in the data - " + f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the " + "populate the 'alternative_class_names' attribute to accommodate for variations.") + category_value2id = copy.deepcopy(updated_category_value2id) + logger.info("Updated categoryvalue2id mapping - %s", category_value2id) # Else create the mapping from the labels found in the data - else: + if len(category_value2id) != len(category_values): for c in category_values: if c not in category_value2id: category_value2id[c] = len(category_value2id) @@ -239,30 +245,33 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 - logger.info("Original number of samples per label: %s",label_data_) - # Undersampling data - if category_undersample is None or category_undersample == '': - min_label = min(label_data_.values()) + logger.info("Original number of samples per label: %s", label_data_) + + data_undersampled = [] + if config and config.model.phase_number != 0: + # Undersampling data + category_undersample = config.model.category_undersample + if category_undersample is None or category_undersample == '': + min_label = min(label_data_.values()) - else: - if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys(): - min_label = label_data_[category_value2id[category_undersample]] else: - min_label = label_data_[category_undersample] + if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys(): + min_label = label_data_[category_value2id[category_undersample]] + else: + min_label = label_data_[category_undersample] - data_undersampled = [] - label_data_counter = {v: 0 for v in category_value2id.values()} + label_data_counter = {v: 0 for v in category_value2id.values()} - for sample in data: - if label_data_counter[sample[-1]] < min_label: - data_undersampled.append(sample) - label_data_counter[sample[-1]] += 1 - - label_data = {v: 0 for v in category_value2id.values()} - for i in range(len(data_undersampled)): - if data_undersampled[i][2] in category_value2id.values(): - label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - logger.info("Updated number of samples per label (for 2-phase learning): %s",label_data) + for sample in data: + if label_data_counter[sample[-1]] < min_label: + data_undersampled.append(sample) + label_data_counter[sample[-1]] += 1 + + label_data = {v: 0 for v in category_value2id.values()} + for i in range(len(data_undersampled)): + if data_undersampled[i][2] in category_value2id.values(): + label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 + logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data) return data, data_undersampled, category_value2id From 006f190303a1979f635f96dcd36837a27626d8c2 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:50:23 +0100 Subject: [PATCH 2/8] Update data_utils.py --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 8b3b3faf1..91684323c 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -163,8 +163,6 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict Output of `prepare_from_json`. existing_category_value2id(Optional[Dict]): Map from category_value to id (old/existing). - category_undersample: - Name of class that should be used to undersample the data (for 2 phase learning) alternative_class_names: Map that stores the variations of possible class names for the given category (task) config: From dc1a2bf5da7902e5a66699919bbf2918b9c19185 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:27:39 +0100 Subject: [PATCH 3/8] Update data_utils.py --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 91684323c..8998ce24c 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -186,8 +186,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict category_value2id = {} category_values = set([x[2] for x in data]) - if len(category_values)!=config.model.nclasses: - raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses) + + if config: + if len(category_values)!=config.model.nclasses: + raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses) # If categoryvalue2id is pre-defined or if all the classes aren't mentioned if len(category_value2id) != 0: From 089069e1104e54579c5fabca2209de82d0c934a9 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:04:51 +0100 Subject: [PATCH 4/8] Update data_utils.py Creating helper functions for checking alternative class names and undersampling data --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 153 ++++++++++++------ 1 file changed, 103 insertions(+), 50 deletions(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 8998ce24c..11c854b7b 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -153,6 +153,99 @@ def prepare_for_oversampled_data(data: List, return data_sampled +def find_alternate_classname(category_value2id, category_values, alternative_class_names): + """Helper function to find and map to alternative class names for the given category. + Example: For Temporality category, 'Recent' is an alternative to 'Present'. + + Args: + category_value2id (Dict): + The pre-defined category_value2id + category_values (Set): + Contains the classes (labels) found in the data + alternative_class_names (List): + Contains the mapping of alternative class names + + Returns: + category_value2id (Dict): + Updated category_value2id with keys corresponding to alternative class names + + Raises: + Exception: + If no alternatives are found for labels in category_value2id that don't match any of the labels in the data + If the alternatives defined for labels in category_value2id that don't match any of the labels in the data + """ + + updated_category_value2id = {} + for _class in category_value2id.keys(): + if _class in category_values: + updated_category_value2id[_class] = category_value2id[_class] + else: + found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map] + failed_to_find = False + if len(found_in) != 0: + class_name_matched = [label for label in found_in[0] if label in category_values] + if len(class_name_matched) != 0: + updated_category_value2id[class_name_matched[0]] = category_value2id[_class] + logger.info("Class name '%s' does not exist in the data; however a variation of it " + "'%s' is present; updating it...", _class, class_name_matched[0]) + else: + failed_to_find = True + else: + failed_to_find = True + if failed_to_find: + raise Exception("The classes set in the config are not the same as the one found in the data. " + "The classes present in the config vs the ones found in the data - " + f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the " + "populate the 'alternative_class_names' attribute to accommodate for variations.") + category_value2id = copy.deepcopy(updated_category_value2id) + logger.info("Updated categoryvalue2id mapping - %s", category_value2id) + return category_value2id + + +def undersample_data(data,category_value2id,label_data_,config,): + """Undersamples the data for 2 phase learning + + Args: + data (Dict): + Output of `prepare_from_json`. + category_value2id(Dict): + Map from category_value to id. + label_data_: + Map that stores the number of samples for each label + config: + MetaCAT config + + Returns: + dict: + Undersampled data (for 2 phase learning) with integers inplace of strings for category values + """ + + + data_undersampled = [] + category_undersample = config.model.category_undersample + if category_undersample is None or category_undersample == '': + min_label = min(label_data_.values()) + + else: + if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys(): + min_label = label_data_[category_value2id[category_undersample]] + else: + min_label = label_data_[category_undersample] + + label_data_counter = {v: 0 for v in category_value2id.values()} + + for sample in data: + if label_data_counter[sample[-1]] < min_label: + data_undersampled.append(sample) + label_data_counter[sample[-1]] += 1 + + label_data = {v: 0 for v in category_value2id.values()} + for i in range(len(data_undersampled)): + if data_undersampled[i][2] in category_value2id.values(): + label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 + logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data) + return data_undersampled + def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None, alternative_class_names: List[List] = [], config=None) -> Tuple: """Converts the category values in the data outputted by `prepare_from_json` @@ -177,7 +270,9 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict Map from category value to ID for all categories in the data. Raises: - Exception: If categoryvalue2id is pre-defined and its labels do not match the labels found in the data + Exception: + If the number of classes in config do not match the number of classes found in the data + If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty """ data = list(data) if existing_category_value2id is not None: @@ -188,8 +283,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict category_values = set([x[2] for x in data]) if config: - if len(category_values)!=config.model.nclasses: - raise Exception("The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.",len(category_values),config.model.nclasses) + if len(category_values) != config.model.nclasses: + raise Exception( + "The number of classes found in the data - %s does not match the number of classes defined in the config - %s (config.model.nclasses). Please update the number of classes and initialise the model again.", + len(category_values), config.model.nclasses) # If categoryvalue2id is pre-defined or if all the classes aren't mentioned if len(category_value2id) != 0: @@ -203,30 +300,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict "The classes present in the config vs the ones found in the data - " f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the " "'alternative_class_names' attribute to accommodate for variations.") - updated_category_value2id = {} - for _class in category_value2id.keys(): - if _class in category_values: - updated_category_value2id[_class] = category_value2id[_class] - else: - found_in = [sub_map for sub_map in alternative_class_names if _class in sub_map] - failed_to_find = False - if len(found_in) != 0: - class_name_matched = [label for label in found_in[0] if label in category_values] - if len(class_name_matched) != 0: - updated_category_value2id[class_name_matched[0]] = category_value2id[_class] - logger.info("Class name '%s' does not exist in the data; however a variation of it " - "'%s' is present; updating it...", _class, class_name_matched[0]) - else: - failed_to_find = True - else: - failed_to_find = True - if failed_to_find: - raise Exception("The classes set in the config are not the same as the one found in the data. " - "The classes present in the config vs the ones found in the data - " - f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the " - "populate the 'alternative_class_names' attribute to accommodate for variations.") - category_value2id = copy.deepcopy(updated_category_value2id) - logger.info("Updated categoryvalue2id mapping - %s", category_value2id) + + category_value2id = alternative_class_names(category_value2id, category_values, alternative_class_names) # Else create the mapping from the labels found in the data if len(category_value2id) != len(category_values): @@ -249,29 +324,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict data_undersampled = [] if config and config.model.phase_number != 0: - # Undersampling data - category_undersample = config.model.category_undersample - if category_undersample is None or category_undersample == '': - min_label = min(label_data_.values()) - - else: - if category_undersample not in label_data_.keys() and category_undersample in category_value2id.keys(): - min_label = label_data_[category_value2id[category_undersample]] - else: - min_label = label_data_[category_undersample] - - label_data_counter = {v: 0 for v in category_value2id.values()} - - for sample in data: - if label_data_counter[sample[-1]] < min_label: - data_undersampled.append(sample) - label_data_counter[sample[-1]] += 1 - - label_data = {v: 0 for v in category_value2id.values()} - for i in range(len(data_undersampled)): - if data_undersampled[i][2] in category_value2id.values(): - label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data) + data_undersampled = undersample_data(data, category_value2id, label_data_, config) return data, data_undersampled, category_value2id From 25153c03ae7845d924a5358bfa7ef80054eb2056 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:10:34 +0100 Subject: [PATCH 5/8] Update data_utils.py --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 11c854b7b..7bad4bfe3 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -246,6 +246,7 @@ def undersample_data(data,category_value2id,label_data_,config,): logger.info("Updated number of samples per label (for 2-phase learning): %s", label_data) return data_undersampled + def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict] = None, alternative_class_names: List[List] = [], config=None) -> Tuple: """Converts the category values in the data outputted by `prepare_from_json` @@ -301,7 +302,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict f"{set(category_value2id.keys())}, {category_values}. Additionally, ensure the populate the " "'alternative_class_names' attribute to accommodate for variations.") - category_value2id = alternative_class_names(category_value2id, category_values, alternative_class_names) + category_value2id = find_alternate_classname(category_value2id, category_values, alternative_class_names) # Else create the mapping from the labels found in the data if len(category_value2id) != len(category_values): From 5828cb25db806b58510e31773ec98198b8487358 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:26:59 +0100 Subject: [PATCH 6/8] Update data_utils.py Changes for flake8 --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index 7bad4bfe3..e28bc228f 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -170,10 +170,9 @@ def find_alternate_classname(category_value2id, category_values, alternative_cla Updated category_value2id with keys corresponding to alternative class names Raises: - Exception: - If no alternatives are found for labels in category_value2id that don't match any of the labels in the data - If the alternatives defined for labels in category_value2id that don't match any of the labels in the data - """ + Exception: If no alternatives are found for labels in category_value2id that don't match any of the labels in the data + Exception: If the alternatives defined for labels in category_value2id that don't match any of the labels in the data + """ updated_category_value2id = {} for _class in category_value2id.keys(): @@ -220,7 +219,6 @@ def undersample_data(data,category_value2id,label_data_,config,): Undersampled data (for 2 phase learning) with integers inplace of strings for category values """ - data_undersampled = [] category_undersample = config.model.category_undersample if category_undersample is None or category_undersample == '': @@ -271,10 +269,10 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict Map from category value to ID for all categories in the data. Raises: - Exception: - If the number of classes in config do not match the number of classes found in the data - If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty + Exception: If the number of classes in config do not match the number of classes found in the data + Exception: If category_value2id is pre-defined, its labels do not match the labels found in the data and alternative_class_names is empty """ + data = list(data) if existing_category_value2id is not None: category_value2id = existing_category_value2id From ba6e71ddf241ff39663adcf062030393ae67ac8b Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:11:14 +0100 Subject: [PATCH 7/8] Update data_utils.py --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index e28bc228f..f8e5b66fb 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, Iterable, List, Union +from typing import Any, Dict, Optional, Tuple, Iterable, List, Union, Set from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase import copy import logging @@ -153,7 +153,7 @@ def prepare_for_oversampled_data(data: List, return data_sampled -def find_alternate_classname(category_value2id, category_values, alternative_class_names): +def find_alternate_classname(category_value2id: Dict, category_values: Set, alternative_class_names: List[List]) -> Dict: """Helper function to find and map to alternative class names for the given category. Example: For Temporality category, 'Recent' is an alternative to 'Present'. @@ -201,7 +201,7 @@ def find_alternate_classname(category_value2id, category_values, alternative_cla return category_value2id -def undersample_data(data,category_value2id,label_data_,config,): +def undersample_data(data: List, category_value2id: Dict, label_data_,config) -> List: """Undersamples the data for 2 phase learning Args: @@ -215,8 +215,8 @@ def undersample_data(data,category_value2id,label_data_,config,): MetaCAT config Returns: - dict: - Undersampled data (for 2 phase learning) with integers inplace of strings for category values + data_undersampled (list): + Return the data created for 2 phase learning) with integers inplace of strings for category values """ data_undersampled = [] @@ -261,11 +261,11 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict MetaCAT config Returns: - dict: + data (list): New data with integers inplace of strings for category values. - dict: + data_undersampled (list): New undersampled data (for 2 phase learning) with integers inplace of strings for category values - dict: + category_value2id (dict): Map from category value to ID for all categories in the data. Raises: From 5eaed0fa7155e7bb1b4b635d575109b3b38e27d2 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:22:05 +0100 Subject: [PATCH 8/8] Update data_utils.py --- v1/medcat/medcat/utils/meta_cat/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v1/medcat/medcat/utils/meta_cat/data_utils.py b/v1/medcat/medcat/utils/meta_cat/data_utils.py index f8e5b66fb..9272d1a2d 100644 --- a/v1/medcat/medcat/utils/meta_cat/data_utils.py +++ b/v1/medcat/medcat/utils/meta_cat/data_utils.py @@ -205,7 +205,7 @@ def undersample_data(data: List, category_value2id: Dict, label_data_,config) -> """Undersamples the data for 2 phase learning Args: - data (Dict): + data (List): Output of `prepare_from_json`. category_value2id(Dict): Map from category_value to id.