<a href="https://colab.research.google.com/github/DilliKafley/Entropy/blob/main/ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import necessary packages
import json
import re

# Patterns are compiled once at import time instead of once per processed line.
_LINE_WITH_DIGIT_RE = re.compile(r'.*\d+.*')
_NIL_RE = re.compile(r'\bnil\b', re.IGNORECASE)
_PAREN_WITH_DIGIT_RE = re.compile(r'\(.*?\d+.*?\)')
# "<parameter> <number> <unit>"; the number may carry a decimal fraction.
_MEASUREMENT_RE = re.compile(r"(.+)\s+(\d+(?:\.\d+)?)\s+([\w\s\/*\-]+)\b")
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def _load_synonyms(json_file_path):
    """Load the abbreviation file and return ``{abbreviation: [synonyms...]}``.

    The JSON document is expected to be a list of objects that each carry an
    ``'Abbreviation'`` key and a ``'Synonyms'`` list. Entries sharing the same
    abbreviation are merged in file order.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    result = {}
    for entry in data:
        # setdefault + extend always copies into a fresh list, so the loaded
        # JSON structure is never mutated when duplicates are merged.
        result.setdefault(entry['Abbreviation'], []).extend(entry['Synonyms'])
    return result


def _read_numeric_lines(ocr_path):
    """Return every line of the OCR text file that contains at least one digit."""
    with open(ocr_path, 'r') as file:
        return _LINE_WITH_DIGIT_RE.findall(file.read())


def _filter_known_lines(converted_dict, lines):
    """Keep the lines mentioning any abbreviation or synonym as a whole word.

    Matching is case-insensitive (both sides are lower-cased).
    """
    terms = []
    for key, values in converted_dict.items():
        terms.append(key)
        terms.extend(values)
    if not terms:
        # An empty alternation would match every line; nothing is known here.
        return []

    # One alternation replaces a freshly compiled pattern per term per line.
    alternatives = '|'.join(re.escape(term.lower()) for term in terms)
    known_term = re.compile(r'\b(?:{})\b'.format(alternatives))
    return [line for line in lines if known_term.search(line.lower())]


def _canonicalise_words(lines, converted_dict):
    """Replace abbreviations and alternative synonyms by the first synonym.

    Words are compared exactly (case-sensitive) after splitting on whitespace.
    An abbreviation always wins over a synonym of another entry; among
    synonyms the first entry in dictionary order wins.
    """
    replacements = {}
    for values in converted_dict.values():
        if not values:
            continue
        canonical = values[0]
        for synonym in values[1:]:
            replacements.setdefault(synonym, canonical)
    for key, values in converted_dict.items():
        # An abbreviation without synonyms has no canonical name: keep it
        # unchanged instead of failing with IndexError.
        replacements[key] = values[0] if values else key

    return [
        ' '.join(replacements.get(word, word) for word in line.split())
        for line in lines
    ]


def _parse_measurements(lines):
    """Turn ``"<parameter> <number> <unit>"`` lines into record dictionaries.

    Lines that do not match the pattern are skipped.
    """
    records = []
    for line in lines:
        match = _MEASUREMENT_RE.search(line)
        if match:
            records.append({
                'parameter': match.group(1).strip(),
                'value': float(match.group(2)),
                'unit': match.group(3).strip(),
            })
    return records


def _normalise_units(records):
    """Strip numbers from parameter names and map purely numeric units to 'NA'."""
    cleaned = []
    for record in records:
        parameter = _NUMBER_RE.sub('', record['parameter']).strip()
        unit = record['unit']
        # A unit made only of digits is a stray number, not a real unit.
        unit = 'NA' if unit.isdigit() else unit.strip()
        cleaned.append(
            {'parameter': parameter, 'value': record['value'], 'unit': unit}
        )
    return cleaned


def process_ocr(json_path, ocr_path):
    """Extract measurements from an OCR text file using a synonym dictionary.

    Args:
        json_path: Path to a JSON file holding a list of objects with the keys
            ``'Abbreviation'`` and ``'Synonyms'`` (a list of strings).
        ocr_path: Path to the OCR text file, one candidate measurement per line.

    Returns:
        A list of ``{'parameter': str, 'value': float, 'unit': str}``
        dictionaries, in the order the matching lines appear in the OCR text.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the JSON file is malformed.
        KeyError: If a JSON entry lacks ``'Abbreviation'`` or ``'Synonyms'``.
    """
    converted_dict = _load_synonyms(json_path)
    # NOTE: lines are filtered for digits *before* "nil" is turned into 0, so
    # a line whose only value is "nil" (no other digit) is not picked up.
    lines = _read_numeric_lines(ocr_path)
    lines = _filter_known_lines(converted_dict, lines)
    lines = _canonicalise_words(lines, converted_dict)

    # Some measurements are reported as "nil"; treat them as 0.
    lines = [_NIL_RE.sub('0', text) for text in lines]
    # Drop parenthesised groups containing digits (e.g. reference ranges).
    lines = [_PAREN_WITH_DIGIT_RE.sub('', text) for text in lines]

    return _normalise_units(_parse_measurements(lines))

# Demo


In [4]:
# Run the pipeline on the Colab sample files; the bare name below makes the
# notebook display the resulting list of records.
demo = process_ocr(
    json_path='/content/sample_data/X1.json',
    ocr_path='/content/sample_data/ocr1.txt',
)
demo

[{'parameter': 'pH  Protein', 'value': 0.0, 'unit': 'Glucose 0'},
 {'parameter': 'Red Blood Cell Count H', 'value': 14.0, 'unit': 'x10*6/L'},
 {'parameter': 'Rubella gamma globulin (CMIA)',
  'value': 11.0,
  'unit': 'IU/mL'},
 {'parameter': 'Sodium  L', 'value': 140.0, 'unit': 'mmol/L'},
 {'parameter': 'Chloride', 'value': 106.0, 'unit': 'mmol/L'},
 {'parameter': 'carbon dioxide', 'value': 25.0, 'unit': 'mmol/L'},
 {'parameter': 'creatine', 'value': 65.0, 'unit': 'umol/L'},
 {'parameter': 'Bili.Total  H  H', 'value': 18.0, 'unit': 'umol/L'},
 {'parameter': 'alkaline phosphatase', 'value': 70.0, 'unit': 'U/L'},
 {'parameter': 'gamma-glutamyltransferase', 'value': 15.0, 'unit': 'U/L'},
 {'parameter': 'aspartate transaminase', 'value': 23.0, 'unit': 'U/L'},
 {'parameter': 'alanine transaminase', 'value': 22.0, 'unit': 'U/L'},
 {'parameter': 'Total Protein', 'value': 72.0, 'unit': 'g/L'},
 {'parameter': 'Albumin', 'value': 46.0, 'unit': 'g/L'},
 {'parameter': 'Ferritin', 'value': 47.0, 'u