In [54]:
import re


# For every entity, the keywords that are looked up in the OCR text.
# Each value is a list so that an entity may be announced by several phrasings.
entity_synonym_map = dict(
    width=['width'],
    depth=['depth'],
    height=['height'],
    item_weight=['weight'],
    maximum_weight_recommendation=['maximum weight', 'recommended weight'],
    voltage=['voltage'],
    wattage=['wattage'],
    item_volume=['volume', 'capacity'],
)


In [55]:
# Lower-case spelling or abbreviation -> canonical unit name.
# Built once at import time instead of on every normalize_unit() call.
_UNIT_REPLACEMENTS = {
    'centimetre': 'centimetre', 'cm': 'centimetre',
    'foot': 'foot', 'ft': 'foot',
    'inch': 'inch', 'in': 'inch',
    'metre': 'metre', 'm': 'metre',
    'millimetre': 'millimetre', 'mm': 'millimetre',
    'yard': 'yard', 'yd': 'yard',
    'gram': 'gram', 'g': 'gram',
    'kilogram': 'kilogram', 'kg': 'kilogram',
    'microgram': 'microgram', 'µg': 'microgram',
    'milligram': 'milligram', 'mg': 'milligram',
    'ounce': 'ounce', 'oz': 'ounce',
    'pound': 'pound', 'lb': 'pound',
    'ton': 'ton', 't': 'ton',
    'kilovolt': 'kilovolt', 'kv': 'kilovolt',
    'millivolt': 'millivolt', 'mv': 'millivolt',
    'volt': 'volt', 'v': 'volt',
    'kilowatt': 'kilowatt', 'kw': 'kilowatt',
    'watt': 'watt', 'w': 'watt',
    'centilitre': 'centilitre', 'cl': 'centilitre',
    'cubic foot': 'cubic foot', 'ft³': 'cubic foot',
    'cubic inch': 'cubic inch', 'in³': 'cubic inch',
    'cup': 'cup',
    'decilitre': 'decilitre', 'dl': 'decilitre',
    'fluid ounce': 'fluid ounce', 'fl oz': 'fluid ounce',
    'gallon': 'gallon', 'gal': 'gallon',
    'imperial gallon': 'imperial gallon', 'imp gal': 'imperial gallon',
    'litre': 'litre', 'l': 'litre',
    'microlitre': 'microlitre', 'µl': 'microlitre',
    'millilitre': 'millilitre', 'ml': 'millilitre',
    'pint': 'pint', 'pt': 'pint',
    'quart': 'quart', 'qt': 'quart'
}


def normalize_unit(unit):
    """Map a raw unit string to its canonical unit name.

    The input is lower-cased and stripped, then cut at the first character
    that is neither a word character nor whitespace (so ``"kg."`` becomes
    ``"kg"``). Known spellings and abbreviations are replaced by their
    canonical name; anything else is returned in its cleaned form.

    Args:
        unit: Raw unit text, e.g. ``"CM"``, ``"fl oz"`` or ``"kg."``.

    Returns:
        The canonical unit name when the cleaned text is a known key,
        otherwise the cleaned text itself.
    """
    unit = unit.lower().strip()
    # Drop everything from the first punctuation/symbol character onwards.
    unit = re.split(r'[^\w\s]', unit)[0].strip()
    return _UNIT_REPLACEMENTS.get(unit, unit)

In [56]:
def extract_value_unit_with_context(entity_name, ocr_text, allowed_units, synonym_map):
    """Find the first "<number> <unit>" pair that follows a mention of an entity.

    Every synonym of ``entity_name`` is searched for case-insensitively, in
    list order. For the first occurrence of a synonym, only the text after it
    is scanned for numbers followed by alphabetic words. The words are passed
    through ``normalize_unit`` and compared with the lower-cased allowed units.

    Args:
        entity_name: Key identifying the entity, e.g. ``"item_volume"``.
        ocr_text: Text to scan.
        allowed_units: Iterable of unit strings accepted for this entity.
        synonym_map: Mapping of entity name to a list of keywords; when the
            entity has no entry, the entity name itself is the only keyword.

    Returns:
        ``(value, normalized_unit)`` with ``value`` as a float for the first
        acceptable pair, or ``(None, None)`` when nothing acceptable is found.
    """
    possible_synonyms = synonym_map.get(entity_name, [entity_name])
    # Lower-case the allowed units once instead of once per candidate.
    allowed_lower = {u.lower() for u in allowed_units}
    # A number followed by one or more whitespace-separated alphabetic words.
    value_unit_re = re.compile(r'(\d*\.?\d+)\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)')
    mention_found = False

    for synonym in possible_synonyms:
        entity_match = re.search(re.escape(synonym), ocr_text, re.IGNORECASE)
        if not entity_match:
            continue
        mention_found = True

        ocr_text_after_entity = ocr_text[entity_match.end():]
        matches = value_unit_re.findall(ocr_text_after_entity)

        print(f"Matches found after '{synonym}': {matches}")

        for value, unit in matches:
            words = unit.split()
            # The pattern greedily swallows the prose that follows a unit
            # ("volts and a wattage of"), so test the longest leading run of
            # words first (keeps "fluid ounce" intact) and then shorter runs.
            for word_count in range(len(words), 0, -1):
                candidate = ' '.join(words[:word_count])
                normalized_unit = normalize_unit(candidate)
                print(f"Checking value: {value}, unit: {candidate} (normalized: {normalized_unit})")
                if normalized_unit in allowed_lower:
                    return float(value), normalized_unit

    if mention_found:
        # A keyword was present, but no number after it carried an allowed unit.
        print(f"No allowed unit found after the mention of {entity_name} in OCR text.")
    else:
        print(f"No mention of {entity_name} or its synonyms found in OCR text.")
    return None, None

In [57]:
def preprocess_entity_value(entity_name, ocr_text, entity_unit_map, synonym_map):
    """Extract an entity's measurement from OCR text as a ``"<value> <unit>"`` string.

    Args:
        entity_name: Key identifying the entity, e.g. ``"item_volume"``.
        ocr_text: Text to scan.
        entity_unit_map: Mapping of entity name to its allowed units; an
            entity without an entry has no allowed units.
        synonym_map: Mapping of entity name to keywords, forwarded to
            ``extract_value_unit_with_context``.

    Returns:
        ``"<value> <unit>"`` when a value with an allowed unit is found,
        otherwise the empty string.
    """
    allowed_units_for_entity = entity_unit_map.get(entity_name, set())
    print(f"Allowed units for {entity_name}: {allowed_units_for_entity}")

    value, unit = extract_value_unit_with_context(
        entity_name, ocr_text, allowed_units_for_entity, synonym_map
    )

    # Test against None explicitly: a genuine reading of 0.0 is falsy and
    # would otherwise be reported as "no value found".
    if value is not None and unit:
        return f"{value} {unit}"

    print(f"No valid value found for {entity_name}")
    return ""

In [62]:
# Sample product description used as input for the extraction demo.
# Written as explicit pieces so the trailing spaces before each line break stay visible.
ocr_text = (
    "\n"
    "The product dimensions are as follows: height is 15.75 inches, \n"
    "width measures 30.5 centimetre, and depth extends to 0.85 m. In terms of weight, \n"
    "the item weight around 1.2 kilograms, with a maximum weight recommendation of 2.5 tons. \n"
    "The electrical specifications include a voltage of 230 volts and a wattage of 500 watt. \n"
    "For volume, the item has a liquid capacity of volume 1.5 litre, or 1500 millilitres. Please ensure \n"
    "you do not exceed these recommended values for safe operation.\n"
)

In [63]:
# Entity whose value is extracted in the demo run.
entity_name = 'item_volume'

In [64]:
# Units accepted for each entity. Entities measuring the same kind of quantity
# share one spelling list, but each key still gets its own independent set.
entity_unit_map = {
    **{
        length_entity: {
            'centimetre', 'cm', 'centimeters',
            'foot', 'ft',
            'inch', 'in', 'inches',
            'metre', 'm', 'meters',
            'millimetre', 'mm', 'millimeters',
            'yard', 'yd', 'yards',
        }
        for length_entity in ('width', 'depth', 'height')
    },
    **{
        weight_entity: {
            'gram', 'g', 'grams',
            'kilogram', 'kg', 'kilograms',
            'microgram', 'µg', 'micrograms',
            'milligram', 'mg', 'milligrams',
            'ounce', 'oz', 'ounces',
            'pound', 'lb', 'pounds',
            'ton', 't', 'tons',
        }
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': {
        'kilovolt', 'kV', 'kilovolts',
        'millivolt', 'mV', 'millivolts',
        'volt', 'V', 'volts',
    },
    'wattage': {
        'kilowatt', 'kW', 'kilowatts',
        'watt', 'W', 'watts',
    },
    'item_volume': {
        'centilitre', 'cL', 'centilitres',
        'cubic foot', 'ft³', 'cubic feet',
        'cubic inch', 'in³', 'cubic inches',
        'cup', 'cups',
        'decilitre', 'dL', 'decilitres',
        'fluid ounce', 'fl oz', 'fluid ounces',
        'gallon', 'gal', 'gallons',
        'imperial gallon', 'imp gal', 'imperial gallons',
        'litre', 'L', 'litres',
        'microlitre', 'µL', 'microlitres',
        'millilitre', 'mL', 'millilitres',
        'pint', 'pt', 'pints',
        'quart', 'qt', 'quarts',
    },
}


In [65]:
# Run the extraction for the selected entity and report the formatted result.
preprocessed_value = preprocess_entity_value(
    entity_name,
    ocr_text,
    entity_unit_map,
    entity_synonym_map,
)

print(f"Preprocessed value for '{entity_name}': {preprocessed_value}")

Allowed units for item_volume: {'microlitre', 'in³', 'cubic feet', 'cubic inches', 'L', 'pint', 'dL', 'centilitres', 'quart', 'cubic inch', 'ft³', 'litre', 'litres', 'pints', 'cups', 'imp gal', 'mL', 'gallon', 'fluid ounces', 'quarts', 'pt', 'qt', 'fl oz', 'cup', 'fluid ounce', 'microlitres', 'gal', 'centilitre', 'decilitres', 'millilitres', 'imperial gallons', 'µL', 'cL', 'cubic foot', 'imperial gallon', 'decilitre', 'millilitre', 'gallons'}
Matches found after 'volume': [('1.5', 'litre'), ('1500', 'millilitres')]
Checking value: 1.5, unit: litre (normalized: litre)
Preprocessed value for 'item_volume': 1.5 litre
