In [118]:
import re

def extract_measurements(texts, entity):
    """Extract "<number><unit>" measurement strings for one entity type.

    Each text is cleaned (every character other than ASCII letters, digits,
    ``'``, ``"``, ``.`` and space becomes a space) and then scanned for a
    number followed by one of the unit spellings accepted for ``entity``.

    Args:
        texts: Iterable of strings to scan. A single string is treated as a
            one-element list instead of being iterated character by character.
        entity: Entity name, compared case-insensitively. Supported names are
            item_weight, maximum_weight_recommendation, height, depth, width,
            voltage, wattage and item_volume.

    Returns:
        A list of the matched substrings in order of appearance, keeping the
        original casing and inner spacing (e.g. ``['5.5inch', '12 ft']``) but
        never trailing whitespace. Returns ``[]`` for an unknown entity.
    """
    number = r'[0-9]+(?:\.[0-9]+)?'

    # Unit spellings are defined once per physical quantity so that entities
    # sharing a quantity (height/depth/width, both weights) cannot drift apart.
    weight_units = (
        r'kgs?|kilo(?:gram)?s?|micro(?:gram)?s?|mgs?|mil?li(?:gram)?s?'
        r'|gs?|gms?|grms?|grams?|lbs?|pounds?|tons?|tns?|ozs?|ounces?'
    )
    length_units = (
        r'cms?|centimet(?:er|re)s?|m|met(?:er|re)s?|ins?|inch(?:es)?'
        r'|fts?|feet|foot|yards?|yds?|mil?limet(?:er|re)s?|mms?'
    )
    voltage_units = r'vs?|volts?|kvs?|mvs?|kilovolts?|mil?livolts?'
    wattage_units = r'ws?|watts?|kws?|kilowatts?'
    volume_units = (
        r'mls?|mill?ilit(?:er|re)s?|ls?|lit(?:er|re)s?|lts?|cups?|gallons?'
        r'|imperial ?gallons?|fl\.? ?ozs?|fluid ?ounces?|fl ?ounces?|pints?|quarts?'
        r'|centilit(?:er|re)s?|microlit(?:er|re)s?|decilit(?:er|re)s?'
        r'|cubic ?foot|cubic ?feet|cubic ?inch(?:es)?'
    )

    def word_units(alternatives):
        # The trailing \b keeps a unit from matching the start of a longer
        # word ("60 wide" is not "60 w", "5 lbs" is not "5 l").
        return r'(?:' + alternatives + r')\b'

    weight = word_units(weight_units)
    # Tick marks are not word characters, so \b cannot terminate them; instead
    # require that no letter/digit follows (rejects "90's").
    length = r'(?:' + word_units(length_units) + r'|[\'"](?!\w))'

    patterns = {
        'item_weight': weight,
        'maximum_weight_recommendation': weight,
        'height': length,
        'depth': length,
        'width': length,
        'voltage': word_units(voltage_units),
        'wattage': word_units(wattage_units),
        'item_volume': word_units(volume_units),
    }

    # Select the pattern based on entity type
    unit_pattern = patterns.get(entity.lower())
    if not unit_pattern:
        return []
    # Only non-capturing groups are used, so findall() yields whole matches.
    compiled_pattern = re.compile(number + r'\s*' + unit_pattern, re.IGNORECASE)

    if isinstance(texts, str):
        texts = [texts]

    result = []
    for text in texts:
        # Keep both tick marks so 5" and 2' survive cleaning.
        cleaned_text = re.sub(r'[^a-zA-Z0-9\'". ]', ' ', text)
        # The unit is mandatory in every pattern, so each match already holds
        # a digit and a unit; no post-filtering is needed.
        result.extend(compiled_pattern.findall(cleaned_text))
    return result

In [141]:
import re

# Canonical unit names allowed for each entity type (as provided)
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Reverse lookup: every spelling we may encounter -> canonical unit name.
# Seed it with each canonical unit and its naive "+s" plural. Irregular
# plurals and abbreviations are supplied explicitly below.
unit_normalization = {}
for key, units in entity_unit_map.items():
    for unit in units:
        unit_normalization[unit.lower()] = unit.lower()
        unit_normalization[unit.lower() + 's'] = unit.lower()

# Abbreviations, US spellings and irregular plurals. All keys are lower-case
# because lookups lower-case the unit first.
unit_normalization.update({
    # Weight units
    'lbs': 'pound',
    'lb': 'pound',
    'pounds': 'pound',
    'oz': 'ounce',
    'ozs': 'ounce',
    'ounces': 'ounce',
    'ton': 'ton',
    'tons': 'ton',
    'tn': 'ton',
    'tns': 'ton',
    'g': 'gram',
    'gs': 'gram',
    'gm': 'gram',
    'gms': 'gram',
    'grm': 'gram',
    'grms': 'gram',
    'kg': 'kilogram',
    'kgs': 'kilogram',
    'kilo': 'kilogram',
    'kilos': 'kilogram',
    'mg': 'milligram',
    'mgs': 'milligram',

    # Length/Dimension units
    'cm': 'centimetre',
    'cms': 'centimetre',
    'centimeter': 'centimetre',
    'centimeters': 'centimetre',
    'centimetres': 'centimetre',
    'm': 'metre',
    'meter': 'metre',
    'meters': 'metre',
    'metres': 'metre',
    'mm': 'millimetre',
    'mms': 'millimetre',
    'millimeter': 'millimetre',
    'millimeters': 'millimetre',
    'millimetres': 'millimetre',
    'ft': 'foot',
    'ft.': 'foot',
    'fts': 'foot',
    'feet': 'foot',
    'foot': 'foot',
    'in': 'inch',
    'in.': 'inch',
    'ins': 'inch',
    'inches': 'inch',
    '"': 'inch',    # double quote to represent inches
    # NOTE(review): a single tick conventionally denotes feet; it is kept as
    # 'inch' here to preserve the existing behavior - confirm this is intended.
    '\'': 'inch',   # apostrophe (single tick)
    'yd': 'yard',
    'yds': 'yard',
    'yards': 'yard',

    # Voltage units
    'v': 'volt',
    'vs': 'volt',
    'volts': 'volt',
    'kv': 'kilovolt',
    'kvs': 'kilovolt',
    'mv': 'millivolt',
    'mvs': 'millivolt',

    # Wattage units
    'w': 'watt',
    'ws': 'watt',
    'watts': 'watt',
    'kw': 'kilowatt',
    'kws': 'kilowatt',

    # Volume units
    'ml': 'millilitre',
    'mls': 'millilitre',
    'milliliter': 'millilitre',
    'milliliters': 'millilitre',
    'millilitres': 'millilitre',
    'l': 'litre',
    'ls': 'litre',
    'liter': 'litre',
    'liters': 'litre',
    'litres': 'litre',
    'lt': 'litre',
    'lts': 'litre',
    'centilitres': 'centilitre',
    'centiliter': 'centilitre',
    'centiliters': 'centilitre',
    'decilitres': 'decilitre',
    'deciliter': 'decilitre',
    'deciliters': 'decilitre',
    'microlitres': 'microlitre',
    'microliter': 'microlitre',
    'microliters': 'microlitre',
    'fl oz': 'fluid ounce',
    'fl. oz': 'fluid ounce',
    'fl ozs': 'fluid ounce',
    'fl ounce': 'fluid ounce',
    'fl ounces': 'fluid ounce',
    'fluid ounces': 'fluid ounce',
    'cups': 'cup',
    'gallons': 'gallon',
    'imperial gallons': 'imperial gallon',
    'pints': 'pint',
    'quarts': 'quart',
    'cubic foot': 'cubic foot',
    'cubic feet': 'cubic foot',
    'cubic inch': 'cubic inch',
    'cubic inches': 'cubic inch'
})


def normalize_entity(unit):
    """Map a unit spelling to its canonical name.

    The unit is lower-cased and looked up in ``unit_normalization``; a
    spelling that is not in the table is returned lower-cased and otherwise
    unchanged.
    """
    lowered = unit.lower()
    if lowered in unit_normalization:
        return unit_normalization[lowered]
    return lowered

def format_extracted_value(extracted_value):
    """Turn a raw "<number><unit>" string into "<number> <canonical unit>".

    Args:
        extracted_value: Raw measurement text such as ``"5.5inches"`` or
            ``" 12 ft "``. Surrounding whitespace is ignored.

    Returns:
        The number, a single space and the unit as normalized by
        ``normalize_entity`` (e.g. ``"5.5 inch"``). Returns ``""`` when the
        input is empty, does not start with a number, or has no unit after
        the number.
    """
    if not extracted_value:
        return ""

    # Leading \s* tolerates padded input; re.match anchors at the start only.
    match = re.match(r'\s*(\d+(?:\.\d+)?)\s*(.*)', extracted_value)
    if not match:
        return ""

    number, unit = match.groups()
    unit = unit.strip()
    if not unit:
        # A bare number is not a measurement; avoid emitting "12 " with a
        # dangling space.
        return ""

    return f"{number} {normalize_entity(unit)}"

# Demo: run the formatter over a handful of raw strings (padded, unspaced,
# abbreviated and tick-mark inputs) and show the formatted results.
extracted_values = [
    "12 ft ",
    "5.5inches",
    "3.2 gallon",
    "0.75 lbs",
    "1.5 g",
    "2'",
    "3'",
]

formatted_values = list(map(format_extracted_value, extracted_values))
print(formatted_values)


['12 foot', '5.5 inch', '3.2 gallon', '0.75 pound', '1.5 gram', '2 inch', '3 inch']


In [130]:
!pip install keras-ocr

Defaulting to user installation because normal site-packages is not writeable
Collecting keras-ocr
  Downloading keras_ocr-0.9.3-py3-none-any.whl (42 kB)
     -------------------------------------- 42.3/42.3 kB 682.4 kB/s eta 0:00:00
Collecting shapely
  Downloading shapely-2.0.6-cp39-cp39-win_amd64.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 841.3 kB/s eta 0:00:00
Collecting imgaug
  Downloading imgaug-0.4.0-py2.py3-none-any.whl (948 kB)
     -------------------------------------- 948.0/948.0 kB 1.4 MB/s eta 0:00:00
Collecting essential_generators
  Downloading essential_generators-1.0-py3-none-any.whl (9.5 MB)
     ---------------------------------------- 9.5/9.5 MB 1.7 MB/s eta 0:00:00
Collecting editdistance
  Downloading editdistance-0.8.1-cp39-cp39-win_amd64.whl (79 kB)
     -------------------------------------- 79.6/79.6 kB 100.8 kB/s eta 0:00:00
Collecting efficientnet==1.0.0
  Downloading efficientnet-1.0.0-py3-none-any.whl (17 kB)
Collecting validat

In [132]:
import keras_ocr

# Build the OCR pipeline with keras-ocr's default settings.
pipeline = keras_ocr.pipeline.Pipeline()

# Load the image to run OCR on; here it is read straight from its URL.
IMAGE_URL = 'https://m.media-amazon.com/images/I/110EibNyclL.jpg'
image = keras_ocr.tools.read(IMAGE_URL)

# recognize() is given a one-image batch, so keep only the first result.
batch_predictions = pipeline.recognize([image])
predictions = batch_predictions[0]

# Each prediction unpacks into a pair; only its first element (the text) is kept.
extracted_text = [word for word, _rest in predictions]
print(extracted_text)





  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (



Looking for C:\Users\hp\.keras-ocr\craft_mlt_25k.h5
Downloading C:\Users\hp\.keras-ocr\craft_mlt_25k.h5

Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.

Looking for C:\Users\hp\.keras-ocr\crnn_kurapan.h5
Downloading C:\Users\hp\.keras-ocr\crnn_kurapan.h5
['2n', 'cbcn', 's', 'jeln', 'iaaon', 'ission']


In [None]:
# Gather candidate values into `ans`. `entity` and `dic` are not defined in
# this cell and must already exist in the kernel.
# NOTE(review): for "height" the entries keyed 90/270 are taken before 0/180 -
# presumably these are rotation angles; confirm with the producing cell.
ans = []
if entity == "height":
    ans.extend(dic[90].values)
    ans.extend(dic[270].values)
    ans.extend(dic[0].values)
    ans.extend(dic[180].values)
else:
    # Fixed typo: `exted` is not a list method and raised AttributeError.
    # NOTE(review): if `dic` is a plain dict, `dic.values` is a bound method
    # and this extend will still fail - confirm the type of `dic`.
    ans.extend(dic.values)

In [137]:
# Display the spelling -> canonical-unit lookup table built in another cell.
unit_normalization

{'metre': 'metre',
 'metres': 'metre',
 'inch': 'inch',
 'inchs': 'inch',
 'centimetre': 'centimetre',
 'centimetres': 'centimetre',
 'foot': 'foot',
 'foots': 'foot',
 'millimetre': 'millimetre',
 'millimetres': 'millimetre',
 'yard': 'yard',
 'yards': 'yard',
 'milligram': 'milligram',
 'milligrams': 'milligram',
 'microgram': 'microgram',
 'micrograms': 'microgram',
 'kilogram': 'kilogram',
 'kilograms': 'kilogram',
 'ton': 'ton',
 'tons': 'ton',
 'ounce': 'ounce',
 'ounces': 'ounce',
 'pound': 'pound',
 'pounds': 'pound',
 'gram': 'gram',
 'grams': 'gram',
 'millivolt': 'millivolt',
 'millivolts': 'millivolt',
 'volt': 'volt',
 'volts': 'volt',
 'kilovolt': 'kilovolt',
 'kilovolts': 'kilovolt',
 'watt': 'watt',
 'watts': 'watt',
 'kilowatt': 'kilowatt',
 'kilowatts': 'kilowatt',
 'cubic inch': 'cubic inch',
 'cubic inchs': 'cubic inch',
 'pint': 'pint',
 'pints': 'pint',
 'litre': 'litre',
 'litres': 'litre',
 'cup': 'cup',
 'cups': 'cup',
 'cubic foot': 'cubic foot',
 'cubic foots