In [5]:
import os
import pandas as pd
import cv2
import numpy as np
import easyocr
import spacy
import re
from tqdm import tqdm
import torch
from collections import defaultdict

In [6]:
# Query CUDA support once and report it, so the reader knows whether the
# OCR model below will be able to run on a GPU.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

# Only ask for a device name when a CUDA device actually exists.
if cuda_available:
    print("GPU Name:", torch.cuda.get_device_name(0))

CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


### ModelX

* Constants

In [7]:
# Units accepted for each entity type, stored as {full unit name: abbreviation}.
entity_unit_map = {
    'width': {
        'centimetre': 'cm', 'foot': 'ft', 'inch': 'in',
        'metre': 'm', 'millimetre': 'mm', 'yard': 'yd',
    },
    'depth': {
        'centimetre': 'cm', 'foot': 'ft', 'inch': 'in',
        'metre': 'm', 'millimetre': 'mm', 'yard': 'yd',
    },
    'height': {
        'centimetre': 'cm', 'foot': 'ft', 'inch': 'in',
        'metre': 'm', 'millimetre': 'mm', 'yard': 'yd',
    },
    'item_weight': {
        'gram': 'g', 'kilogram': 'kg', 'microgram': 'µg', 'milligram': 'mg',
        'ounce': 'oz', 'pound': 'lb', 'ton': 't',
    },
    'maximum_weight_recommendation': {
        'gram': 'g', 'kilogram': 'kg', 'microgram': 'µg', 'milligram': 'mg',
        'ounce': 'oz', 'pound': 'lb', 'ton': 't',
    },
    'voltage': {'kilovolt': 'kV', 'millivolt': 'mV', 'volt': 'V'},
    'wattage': {'kilowatt': 'kW', 'watt': 'W'},
    'item_volume': {
        'centilitre': 'cl', 'cubic foot': 'ft³', 'cubic inch': 'in³', 'cup': 'cup',
        'decilitre': 'dl', 'fluid ounce': 'fl oz', 'gallon': 'gal', 'litre': 'L',
    },
}

# Every full unit name and every abbreviation, across all entity types.
allowed_units = {
    name
    for units in entity_unit_map.values()
    for name_and_abbr in units.items()
    for name in name_and_abbr
}
# Also accept the naive plural (trailing 's') of each of those spellings.
allowed_units |= {name + 's' for name in allowed_units}

* ModelX scripts

In [8]:
# Initialize EasyOCR Reader and spaCy
# Initialize the EasyOCR reader (English only) and the spaCy English pipeline.
# gpu=True asks EasyOCR to run on the GPU; pass gpu=False on CPU-only machines.
reader = easyocr.Reader(['en'], gpu=True)
# `nlp` is used by detect_units to split the OCR text into sentences.
nlp = spacy.load("en_core_web_sm")

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [9]:
def perform_ocr(image_path):
    """Run OCR on one image and return all detected text as a single string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The detected text fragments joined with single spaces, in the order
        the module-level EasyOCR ``reader`` reports them. An empty string is
        returned when the file cannot be decoded as an image.
    """
    image_cv = cv2.imread(image_path)
    if image_cv is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising; handing that to the reader would abort a
        # long-running batch, so treat it as "no text found".
        return ''

    detections = reader.readtext(image_cv)
    # Each detection is (bounding box, text, confidence); only the text is kept.
    return ' '.join(text for _bbox, text, _prob in detections)

In [10]:
def detect_units(extracted_text, entity_type, entity_unit_map, allowed_units):
    """Find the largest "<number> <unit>" measurement for an entity in OCR text.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp`` and each sentence is scanned for a number followed by a unit
    token. A token is accepted when, ignoring case, it is one of
    ``allowed_units`` and is either the full name or the abbreviation (or
    either of those with a trailing 's') of a unit listed for ``entity_type``.

    Args:
        extracted_text: OCR output to search.
        entity_type: Key into ``entity_unit_map`` (e.g. 'height').
        entity_unit_map: Mapping of entity type to {full unit name: abbreviation}.
        allowed_units: Collection of unit spellings that may be accepted at all.

    Returns:
        ``"<number> <full unit name>"`` for the match with the greatest
        number (the number is formatted as a float), or ``np.nan`` when the
        text is empty, the entity type is unknown, or nothing matches.
    """
    if not isinstance(extracted_text, str) or not extracted_text.strip():
        return np.nan

    # Map every accepted spelling (lower-cased) to its canonical full name.
    # Comparison is case-insensitive because the token is lower-cased below,
    # while abbreviations such as 'kV', 'W' or 'L' contain capitals.
    unit_aliases = {}
    for full_unit, abbreviation in entity_unit_map.get(entity_type, {}).items():
        for spelling in (full_unit, abbreviation):
            spelling = spelling.lower()
            unit_aliases.setdefault(spelling, full_unit)
            unit_aliases.setdefault(spelling + 's', full_unit)  # naive plural
    if not unit_aliases:
        return np.nan

    allowed_lower = {unit.lower() for unit in allowed_units}
    pattern = re.compile(r'(\d+\.?\d*)\s*([a-zA-Zµ]+)', re.IGNORECASE)

    matches = []
    for sent in nlp(extracted_text).sents:
        for number, unit in pattern.findall(sent.text):
            unit = unit.lower()
            if unit in allowed_lower and unit in unit_aliases:
                matches.append((float(number), unit_aliases[unit]))

    if not matches:
        return np.nan

    # Keep the measurement with the greatest numeric value.
    largest_number, largest_unit = max(matches, key=lambda pair: pair[0])
    return f"{largest_number} {largest_unit}"

### Read CSV

In [11]:
# Load the test set; later cells read its 'index', 'group_id' and 'entity_name' columns.
df = pd.read_csv('test1.csv')

In [12]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [13]:
# (rows, columns) of the loaded table.
df.shape

(43729, 4)

### Batch processing

In [16]:
# First successful prediction seen for each (group_id, entity_name) pair.
# Used as a fallback when a later image of the same pair yields no match.
# Module-level, so it persists across calls to batch_process.
freq = defaultdict(tuple)

def batch_process(df, batch_size):
    """Predict a "<number> <unit>" string for every row of ``df``.

    For each row the image ``images/<index>.jpg`` is OCR'd with
    ``perform_ocr`` and searched with ``detect_units`` for the row's
    ``entity_name``. When nothing is detected, the first prediction
    previously found for the same ``(group_id, entity_name)`` pair is used
    instead, if there is one.

    Args:
        df: DataFrame with 'index', 'group_id' and 'entity_name' columns.
        batch_size: Number of rows handled per progress-bar step.

    Returns:
        DataFrame with columns 'index' and 'prediction'. The prediction is
        '' when the image file is missing and NaN when no unit was detected
        and no fallback exists.
    """
    results = []

    for start in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[start: start + batch_size]

        for _, row in batch.iterrows():
            row_index = row['index']
            image_path = os.path.join('images', str(row_index) + '.jpg')

            if not os.path.exists(image_path):
                results.append({'index': row_index, 'prediction': ''})
                continue

            entity_name = row['entity_name']
            group_key = (row['group_id'], entity_name)
            extracted_text = perform_ocr(image_path)
            result = detect_units(extracted_text, entity_name, entity_unit_map, allowed_units)

            # detect_units reports "no match" as NaN (never None), so the
            # missing-value test must be pd.isna for the fallback to trigger.
            if pd.isna(result):
                # .get does not invoke the defaultdict factory, so unknown
                # groups keep the NaN prediction.
                result = freq.get(group_key, result)
            else:
                # Remember only the first successful prediction per group.
                freq.setdefault(group_key, result)

            results.append({'index': row_index, 'prediction': result})

    # The per-row terminal clear (os.system) was dropped: its platform test
    # compared a function to a string and it spawned a shell for every image.
    # Explicit columns keep the schema stable even when df is empty.
    return pd.DataFrame(results, columns=['index', 'prediction'])

* Run

In [18]:
batch_size = 64  # rows processed per progress-bar step
results_df = batch_process(df, batch_size)
# index=False: results_df already carries the dataset's own 'index' column,
# so writing the pandas row index would add a redundant unnamed column.
results_df.to_csv('first_output.csv', index=False)

  9%|▉         | 61/684 [32:38<9:19:42, 53.90s/it]