In [7]:
!pip install pytesseract pillow requests opencv-python pandas

Defaulting to user installation because normal site-packages is not writeable


In [8]:
import requests
from PIL import Image
import pytesseract
import cv2
import os
import re
import pandas as pd
from io import BytesIO

# Download image helper
def download_image(url, save_path):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img.save(save_path)
            return save_path
    except Exception as e:
        print(f"Error downloading image {url}: {e}")
    return None

# Directory to store downloaded images
download_dir = "images/"
os.makedirs(download_dir, exist_ok=True)

In [9]:
def extract_text_from_image(image_path):
    try:
        img = cv2.imread(image_path)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        text = pytesseract.image_to_string(gray)  # Extract text using OCR
        return text
    except Exception as e:
        print(f"Error extracting text from {image_path}: {e}")
        return ""

In [13]:
entity_unit_map = {
    'width': ['centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'],
    'depth': ['centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'],
    'height': ['centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'],
    'item_weight': ['gram','kilogram','microgram', 'milligram','ounce',
        'pound',
        'ton'],
    'maximum_weight_recommendation': ['gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'],
    'voltage': ['kilovolt', 'millivolt', 'volt'],
    'wattage': ['kilowatt', 'watt'],
    'item_volume': ['centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart']
}

ALLOWED_UNITS = [unit for entity in entity_unit_map for unit in entity_unit_map[entity]]

In [14]:
def extract_entity_value(text, entity_name):
    # Define regex for extracting number and unit
    number_pattern = r"(\d*\.?\d+)"
    unit_pattern = "|".join(ALLOWED_UNITS[entity_name])
    
    pattern = re.compile(f"({number_pattern})\s*({unit_pattern})", re.IGNORECASE)
    match = pattern.search(text)
    
    if match:
        number = match.group(1)
        unit = match.group(2).lower()
        return f"{float(number)} {unit}"
    
    return ""  # If no valid match is found

In [18]:
pytesseract.pytesseract.tesseract_cmd = r'C:\users\hp\appdata\roaming\python\python39\site-packages\tesseract.exe'

In [19]:
# Assuming you have train.csv and test.csv
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Initialize lists for predictions
predictions = []

# Loop through each test image
for idx, row in test_df.iterrows():
    image_url = row['image_link']
    entity_name = row['entity_name']
    
    # Download image
    image_path = os.path.join(download_dir, f"{idx}.jpg")
    downloaded_image = download_image(image_url, image_path)
    
    if downloaded_image:
        # Extract text from image
        extracted_text = extract_text_from_image(downloaded_image)
        
        # Extract the relevant entity value
        entity_value = extract_entity_value(extracted_text, entity_name)
        
        # Store the prediction (if no value found, entity_value will be empty)
        predictions.append([row['index'], entity_value])
    else:
        # If image download fails, add an empty prediction
        predictions.append([row['index'], ""])

# Convert predictions to DataFrame and save to CSV
predictions_df = pd.DataFrame(predictions, columns=["index", "prediction"])
predictions_df.to_csv("submission.csv", index=False)


Error extracting text from images/0.jpg: C:\users\hp\appdata\roaming\python\python39\site-packages\tesseract.exe is not installed or it's not in your PATH. See README file for more information.


TypeError: 'set' object is not subscriptable

In [16]:
!python sanity.py 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
usage: sanity.py [-h] --test_filename TEST_FILENAME --output_filename
                 OUTPUT_FILENAME
sanity.py: error: the following arguments are required: --test_filename, --output_filename
