In [2]:
import pandas as pd
from pathlib import Path
import re
import easyocr

In [3]:
unit_conversion_map = {
    'cm': 'centimetre',
    'ft': 'foot',
    'in': 'inch',
    'm': 'metre',
    'mm': 'millimetre',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'ug': 'microgram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    'ton': 'ton',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'v': 'volt',
    'w': 'watt',
    'kw': 'kilowatt',
    'cl': 'centilitre',
    'cu_ft': 'cubic foot',
    'cu_in': 'cubic inch',
    'cup': 'cup',
    'dl': 'decilitre',
    'fl_oz': 'fluid ounce',
    'gal': 'gallon',
    'imp_gal': 'imperial gallon',
    'l': 'litre',
    'ul': 'microlitre',
    'ml': 'millilitre',
    'pt': 'pint',
    'qt': 'quart',
    'h': 'hour'
}

# Function to extract text from an image
def extract_text_from_image(image_path, use_cuda=False):
    reader = easyocr.Reader(['en'], gpu=use_cuda)
    result = reader.readtext(image_path)
    return result
# Function to clean and convert the extracted text into a standardized format
def clean_extracted_text(extracted_text):
    cleaned_data = []
    single_number_unit_pattern = r'(\d+(\.\d+)?|\d+,\d+)\s*(cm|ft|in|mm|m|yd|g|kg|ug|mg|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|h)'
    range_pattern = r'(\d+(\.\d+)?|\d+,\d+)\s*(cm|ft|in|mm|m|yd|g|kg|ug|mg|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|h)\s*to\s*(\d+(\.\d+)?|\d+,\d+)\s*(cm|ft|in|mm|m|yd|g|kg|ug|mg|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|h)'
    multiple_numbers_pattern = r'((\d+(\.\d+)?|\d+,\d+)(,\s*\d+(\.\d+)?|\d+,\d+)*?)\s*(cm|ft|in|mm|m|yd|g|kg|ug|mg|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|h)'
    bracketed_range_pattern = r'\[\s*(\d+(\.\d+)?|\d+,\d+)\s*,\s*(\d+(\.\d+)?|\d+,\d+)\s*\]\s*(cm|ft|in|mm|m|yd|g|kg|ug|mg|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|h)'

    for text in extracted_text:
        match = re.match(range_pattern, text[1])
        if match:
            cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(3)))
            cleaned_data.append((float(match.group(4).replace(',', '.')), match.group(6)))
        else:
            match = re.match(single_number_unit_pattern, text[1])
            if match:
                cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(3)))
            else:
                match = re.match(multiple_numbers_pattern, text[1])
                if match:
                    numbers = match.group(1).split(',')
                    for number in numbers:
                        cleaned_data.append((float(number.strip().replace(',', '.')), match.group(6)))
                else:
                    match = re.match(bracketed_range_pattern, text[1])
                    if match:
                        cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(5)))
                        cleaned_data.append((float(match.group(3).replace(',', '.')), match.group(5)))
    return cleaned_data

# Function to map shorthand notations to original entity values
def map_units(cleaned_data):
    allowed_units = set(unit_conversion_map.values())
    mapped_data = []
    for number, unit in cleaned_data:
        if unit in unit_conversion_map:
            mapped_unit = unit_conversion_map[unit]
            if mapped_unit in allowed_units:
                mapped_data.append((number, mapped_unit))
    return mapped_data


In [4]:

df = pd.read_csv(r"dataset/train1.csv")
# print(df.columns)

df

Unnamed: 0.1,Unnamed: 0,image_link,group_id,entity_name,entity_value
0,0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...,...
263854,263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [5]:
df.rename(columns={"Unnamed: 0": "index"},inplace=True)
df = df[["index","entity_value","image_link"]]
(df.iloc[1]["image_link"].split("/")[-1])

'71gSRbyXmoL.jpg'

In [81]:
download_folder = Path('downloads')
image_paths = list(download_folder.glob('*.jpg'))
print(str(image_paths[0]).split("\\")[-1])

31EvJszFVfL.jpg


In [82]:
import pandas as pd
from pathlib import Path

# Step 1: Read your DataFrame
df = pd.read_csv(r"dataset/train1.csv")
df.rename(columns={"Unnamed: 0": "index"}, inplace=True)
df = df[["index", "entity_value", "image_link"]]

# Step 2: Extract image names from the DataFrame (from 'image_link')
df['image_name'] = df['image_link'].apply(lambda x: x.split("/")[-1])

# Step 3: Get the list of image paths from the folder
download_folder = Path('downloads')
image_paths = list(download_folder.glob('*.jpg'))

# Step 4: Extract the image names from the folder paths
folder_image_names = {str(image_path).split("\\")[-1]: str(image_path) for image_path in image_paths}

# Step 5: Map the DataFrame image names to the corresponding image paths
df['image_path'] = df['image_name'].map(folder_image_names)

# Step 6: Check the result
df = df[['image_name', 'image_path','entity_value']]
df

Unnamed: 0,image_name,image_path,entity_value
0,61I9XdN6OFL.jpg,downloads\61I9XdN6OFL.jpg,500.0 gram
1,71gSRbyXmoL.jpg,downloads\71gSRbyXmoL.jpg,1.0 cup
2,61BZ4zrjZXL.jpg,downloads\61BZ4zrjZXL.jpg,0.709 gram
3,612mrlqiI4L.jpg,downloads\612mrlqiI4L.jpg,0.709 gram
4,617Tl40LOXL.jpg,downloads\617Tl40LOXL.jpg,1400 milligram
...,...,...,...
263854,612J1R1xHlL.jpg,,5.0 centimetre
263855,61Blzh2+28L.jpg,,8.5 inch
263856,51MsegDL9VL.jpg,,43.2 centimetre
263857,510KhVw4VSL.jpg,,9.1 centimetre


In [83]:
import torch
def process_images(df):
    extracted_data = []

    i =0
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        if i == 3:
            return df
        image_path = row['image_path']
        
        if pd.notna(image_path):  # Check if image path exists
            # Step 2: Perform OCR on the image and clean the text
            extracted_text = extract_text_from_image(str(image_path), use_cuda=True)
            # print(extracted_text)
            cleaned_text = clean_extracted_text(extracted_text)
            mapped_text = map_units(cleaned_text)
            
            # torch.cuda.empty_cache()
            # torch.cuda.synchronize()
            
            # Step 3: Append the cleaned/mapped text to the list
            extracted_data.append(mapped_text)

    # Step 4: Add the extracted data as a new column in the DataFrame
    df['extracted_text'] = extracted_data
    i+=1

    return df

# Step 5: Call the function and store the results
df = process_images(df)

# Step 6: Check the updated DataFrame with the new OCR column
df[['image_name', 'image_path', 'extracted_text']].head(30)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


KeyboardInterrupt: 