In [2]:
import pandas as pd
import easyocr
import requests
from io import BytesIO
from PIL import Image
import re

# Read the CSV dataset into a DataFrame
csv_file_path = 'train.csv'  # Replace with the actual file path
data = pd.read_csv(filepath_or_buffer=csv_file_path)

# Build the English EasyOCR reader once; ocr_from_url uses it for each image
reader = easyocr.Reader(lang_list=['en'])

# Canonical unit names mapped to the lower-case regex fragments that spell
# them, grouped by the kind of quantity measured. The conversion dictionary
# and the extraction patterns below are both generated from this one table,
# so every spelling a pattern can capture has a canonical name to map to.
# Fragments must not contain capturing groups: the patterns rely on group 1
# being the number and group 2 being the unit.
_UNIT_SPELLINGS = {
    # Height, width, depth units
    'length': {
        'centimetre': ('cm', 'centimetres?', 'centimeters?'),
        'millimetre': ('mm', 'millimetres?', 'millimeters?'),
        'metre': ('m', 'metres?', 'meters?'),
        'inch': ('in', 'inch', 'inches'),
        'foot': ('ft', 'foot', 'feet'),
        'yard': ('yd', 'yards?'),
    },
    # Weight and maximum weight recommendation units
    'weight': {
        'gram': ('g', 'grams?'),
        'kilogram': ('kg', 'kilograms?'),
        'microgram': ('micrograms?',),
        'milligram': ('mg', 'milligrams?'),
        'ounce': ('oz', 'ounces?'),
        'pound': ('lb', 'lbs?', 'pounds?'),
        'ton': ('ton', 'tons?'),
    },
    # Volume units
    'volume': {
        'millilitre': ('ml', 'millilitres?', 'milliliters?'),
        'litre': ('l', 'litres?', 'liters?'),
        'centilitre': ('cl', 'centilitres?', 'centiliters?'),
        'decilitre': ('decilitre', 'decilitres', 'deciliters?'),
        'fluid ounce': ('fl oz', 'fluid ounces?'),
        'cup': ('cup', 'cups'),
        'gallon': ('gallon', 'gallons?'),
        'pint': ('pint', 'pints'),
        'quart': ('quart', 'quarts'),
    },
    # Voltage units
    'voltage': {
        'kilovolt': ('kv', 'kilovolts?'),
        'millivolt': ('mv', 'millivolts?'),
        'volt': ('v', 'volts?'),
    },
    # Wattage units
    'wattage': {
        'kilowatt': ('kw', 'kilowatts?'),
        'watt': ('w', 'watts?'),
    },
}

# Unified dictionary for unit conversion including all entity types.
# Keys are regex patterns meant to be tested with re.fullmatch against a
# captured unit; values are the canonical unit names.
unit_conversion_common = {
    rf'\b{spelling}\b': canonical
    for units in _UNIT_SPELLINGS.values()
    for canonical, spellings in units.items()
    for spelling in spellings
}


def _value_unit_regex(kind):
    """Return a pattern capturing (number, unit spelling) for one unit kind."""
    spellings = [
        spelling
        for forms in _UNIT_SPELLINGS[kind].values()
        for spelling in forms
    ]
    # The trailing \b forces the unit to end at a word boundary, so
    # "24 months" is not read as "24 m" and "5 millimeters" is not cut
    # short to "5 m". It also makes the alternation order irrelevant.
    return r'(\d+\.?\d*)\s*(' + '|'.join(spellings) + r')\b'


# Regex patterns for different entity types (expect lower-cased text)
height_width_depth_regex = _value_unit_regex('length')
item_weight_regex = _value_unit_regex('weight')
item_volume_regex = _value_unit_regex('volume')
voltage_regex = _value_unit_regex('voltage')
wattage_regex = _value_unit_regex('wattage')
maximum_weight_recommendation_regex = item_weight_regex  # Same regex as item_weight

# Function to perform OCR on image from URL
def ocr_from_url(image_url, timeout=30):
    """Download the image at ``image_url`` and return the text OCR finds in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that an
            unresponsive host cannot stall the whole run.

    Returns:
        The strings returned by the module-level EasyOCR ``reader`` joined
        with single spaces. This function never raises: on any failure
        (download, HTTP error status, image decoding, OCR) it returns the
        string ``"Error processing <image_url>: <reason>"`` instead, which
        callers are expected to check for.
    """
    try:
        # Fetch the image from the URL
        response = requests.get(image_url, timeout=timeout)
        # Surface 4xx/5xx responses as such rather than letting PIL choke on
        # an error page and report a misleading decode failure
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))

        # Perform OCR and join all detected text into a single string
        ocr_result = reader.readtext(img, detail=0)
        return ' '.join(ocr_result)
    except Exception as e:
        # Deliberately broad: the pipeline is best-effort per image and the
        # caller handles the returned error string.
        return f"Error processing {image_url}: {str(e)}"

# Function to extract the first occurrence of a numeric value and its unit
def extract_first_unit_with_value(text, entity_name):
    """Return the first "<value> <unit>" found in ``text`` for ``entity_name``.

    ``text`` is lower-cased and searched with the module-level regex that
    belongs to ``entity_name``. The captured number is converted to ``float``
    and the captured unit is replaced by the first canonical name in
    ``unit_conversion_common`` whose pattern fully matches it; if none does,
    the unit is kept as captured.

    Returns an empty string when ``entity_name`` is not recognized or when
    nothing in ``text`` matches.
    """
    lowered = text.lower()

    # Entity name -> regex that captures (number, unit) for that entity
    regex_by_entity = {
        'width': height_width_depth_regex,
        'depth': height_width_depth_regex,
        'height': height_width_depth_regex,
        'item_weight': item_weight_regex,
        'maximum_weight_recommendation': maximum_weight_recommendation_regex,
        'voltage': voltage_regex,
        'wattage': wattage_regex,
        'item_volume': item_volume_regex,
    }
    regex = regex_by_entity.get(entity_name)
    if regex is None:
        return ''

    # Only the first occurrence is of interest
    found = re.search(regex, lowered)
    if found is None:
        return ''

    amount = float(found.group(1))
    raw_unit = found.group(2)

    # Swap the captured spelling for its canonical name, if one is known
    unit_name = next(
        (
            full
            for abbrev, full in unit_conversion_common.items()
            if re.fullmatch(abbrev, raw_unit)
        ),
        raw_unit,
    )
    return f"{amount} {unit_name}"

# Maximum number of images to OCR successfully before stopping
image_count = 100

# List to store results
results = []

# Entity names for which extract_first_unit_with_value has a pattern
supported_entities = {
    'width', 'height', 'depth', 'item_weight',
    'maximum_weight_recommendation', 'voltage', 'wattage', 'item_volume',
}

# Apply OCR and regex extraction to each image URL in the dataset, stopping
# once `image_count` images have been processed successfully. Images whose
# download or OCR fails are reported and do not count towards the limit.
processed_count = 0
for _, row in data.iterrows():
    if processed_count >= image_count:
        break  # Stop after processing `image_count` images

    raw_entity_name = row['entity_name']
    if not isinstance(raw_entity_name, str):
        continue  # e.g. an empty CSV cell; there is no entity to extract for

    entity_name = raw_entity_name.strip().lower()
    if entity_name not in supported_entities:
        continue

    image_url = row['image_link']
    group_id = row['group_id']

    ocr_text = ocr_from_url(image_url)

    # ocr_from_url signals failure by returning a message with this exact
    # prefix. Checking the prefix (rather than "Error" anywhere in the text)
    # keeps images whose genuine OCR text happens to contain the word "Error".
    if ocr_text.startswith(f"Error processing {image_url}:"):
        print(f"OCR for {image_url} failed: {ocr_text}")
        continue

    # Extract the first occurrence of a unit and its numeric value
    extracted_text = extract_first_unit_with_value(ocr_text, entity_name)
    results.append({
        'image_url': image_url,
        'group_id': group_id,
        'extracted_text': extracted_text
    })
    processed_count += 1  # Increment the processed count

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Rename 'extracted_text' column to 'entity_value'
results_df.rename(columns={'extracted_text': 'entity_value'}, inplace=True)

# Save the results to a new CSV file
results_df.to_csv('processed_results.csv', index=False)

print("Processing complete. Results saved to 'processed_results.csv'.")

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


OCR for https://m.media-amazon.com/images/I/71FVeRd2jqL.jpg failed: Error processing https://m.media-amazon.com/images/I/71FVeRd2jqL.jpg: MPS backend out of memory (MPS allocated: 2.61 GB, other allocations: 5.37 GB, max allowed: 9.07 GB). Tried to allocate 1.52 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).
OCR for https://m.media-amazon.com/images/I/71duwM3SjpL.jpg failed: Error processing https://m.media-amazon.com/images/I/71duwM3SjpL.jpg: MPS backend out of memory (MPS allocated: 2.09 GB, other allocations: 6.11 GB, max allowed: 9.07 GB). Tried to allocate 1.17 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).
OCR for https://m.media-amazon.com/images/I/81lgxfKqUUL.jpg failed: Error processing https://m.media-amazon.com/images/I/81lgxfKqUUL.jpg: MPS backend out of memory (MPS allocated: 2.09 GB, other allocatio