In [6]:
! pip install -r requirements.txt



In [7]:
import os

# Retrieve secrets from Colab
groq_api_key = os.getenv('GROQ_API_KEY')

In [8]:
# Install the required libraries from requirements.txt
! pip install transformers torch requests Pillow fastapi uvicorn streamlit python-multipart langchain langchain-groq python-dotenv paddleocr PaddlePaddle==2.5.1




In [16]:
import os
import csv
import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from paddleocr import PaddleOCR
from PIL import Image, ImageEnhance
import time
from tqdm import tqdm
import hashlib
import pickle
from collections import Counter

# Initialize PaddleOCR model
ocr_model = PaddleOCR(use_angle_cls=True, lang='en')

# Caching setup
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Counters for tracking unique entries
unique_images = set()
unique_entity_text_pairs = set()

def get_cache_key(data):
    return hashlib.md5(str(data).encode()).hexdigest()

def cache_result(key, data):
    with open(os.path.join(CACHE_DIR, key), 'wb') as f:
        pickle.dump(data, f)

def get_cached_result(key):
    try:
        with open(os.path.join(CACHE_DIR, key), 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None

def extract_text_with_paddle(image_url):
    cache_key = get_cache_key(image_url)
    cached_result = get_cached_result(cache_key)
    if cached_result:
        return cached_result

    unique_images.add(image_url)

    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            with open("temp_image.jpg", "wb") as f:
                f.write(response.content)
            with Image.open("temp_image.jpg") as img:
                enhancer = ImageEnhance.Contrast(img)
                img = enhancer.enhance(2.0)
                img.save("preprocessed_image.jpg")
            result = ocr_model.ocr("preprocessed_image.jpg")
            extracted_text = "\n".join([line[-1][0] for line in result[0]])
            extracted_text = extracted_text.strip()
            cache_result(cache_key, extracted_text)
            return extracted_text
        else:
            return ""
    except Exception as e:
        print(f"Error processing image {image_url}: {str(e)}")
        return ""

llm = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY"),
    verbose=True,
    model_name="llama3-70b-8192",
    temperature=0.1,
    max_tokens=4000
)

entity_unit_map = {
    'width': {'cm', 'centimetre', 'centimeter', 'm', 'metre', 'meter', 'mm', 'millimetre', 'millimeter', 'in', 'inch', 'ft', 'foot', 'yd', 'yard'},
    'depth': {'cm', 'centimetre', 'centimeter', 'm', 'metre', 'meter', 'mm', 'millimetre', 'millimeter', 'in', 'inch', 'ft', 'foot', 'yd', 'yard'},
    'height': {'cm', 'centimetre', 'centimeter', 'm', 'metre', 'meter', 'mm', 'millimetre', 'millimeter', 'in', 'inch', 'ft', 'foot', 'yd', 'yard'},
    'item_weight': {'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'mg', 'milligram', 'milligramme', 'oz', 'ounce', 'lb', 'pound', 't', 'ton', 'tonne'},
    'maximum_weight_recommendation': {'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'mg', 'milligram', 'milligramme', 'oz', 'ounce', 'lb', 'pound', 't', 'ton', 'tonne'},
    'voltage': {'v', 'volt', 'kv', 'kilovolt', 'mv', 'millivolt'},
    'wattage': {'w', 'watt', 'kw', 'kilowatt'},
    'item_volume': {'ml', 'milliliter', 'millilitre', 'l', 'liter', 'litre', 'cl', 'centiliter', 'centilitre', 'fl oz', 'fluid ounce', 'gal', 'gallon', 'pt', 'pint', 'qt', 'quart', 'cup'}
}

prompt_template = PromptTemplate(
    input_variables=["entity_name", "units", "extracted_text"],
    template="""
You are an AI assistant specialized in extracting specific entity information from product descriptions and images. Your task is to accurately identify and extract quantity and unit information for given entities, even when the text is in different languages or the exact entity name isn't present. Pay special attention to numbers and units, regardless of the surrounding text's language.

Entity to extract: {entity_name}
Possible units: {units}
Extracted text from image:
{extracted_text}

Instructions:
1. Analyze the extracted text thoroughly for any mention of quantities and units related to the specified entity, regardless of the language.
2. Look for numbers adjacent to or associated with any of the possible units or their variations (singular/plural, abbreviations) in any language.
3. Consider context clues that might indicate the entity without explicitly naming it (e.g., "poids" for item_weight in French).
4. If multiple matches are found, select the most likely or prominent one based on context and typical product descriptions.
5. Return the result in the following JSON format:
   {{
     "value": number,
     "unit": "unit"
   }}
   Where:
   - 'number' is the extracted numeric value (float)
   - 'unit' is the extracted unit as a string
6. If no clear match is found, assign the most relevant (random if need be) value and unit based on your analysis and other content of the text, like if units are given in the text but they are not written together:
    {{
     "value": number,
     "unit": "unit"
   }}

   for example:
    2.63
6.68cm
91.44cm-199.39cm
36in78n

in this there is no unit written near 2.63 but you can make an inference that in has been mentioned below or like 6.68 cm is same as 2.63 in, so make intelligent guesses too
7. Avoid giving None as output
Provide only the JSON object as your response, without any additional text or explanations.
"""
)

chain = LLMChain(llm=llm, prompt=prompt_template)

def extract_entity_info(entity_name: str, extracted_text: str) -> dict:
    cache_key = get_cache_key(f"{entity_name}_{extracted_text}")
    cached_result = get_cached_result(cache_key)
    if cached_result:
        return cached_result

    unique_entity_text_pairs.add((entity_name, extracted_text))

    if entity_name not in entity_unit_map or not extracted_text:
        return {"value": None, "unit": None}

    units = ", ".join(entity_unit_map[entity_name])

    try:
        result = chain.run(entity_name=entity_name, units=units, extracted_text=extracted_text)
        parsed_result = json.loads(result)

        if isinstance(parsed_result, list) and len(parsed_result) > 0:
            for obj in parsed_result:
                if "value" in obj and "unit" in obj:
                    cache_result(cache_key, {"value": obj["value"], "unit": obj["unit"]})
                    return {"value": obj["value"], "unit": obj["unit"]}

        if isinstance(parsed_result, dict):
            cache_result(cache_key, {"value": parsed_result.get("value"), "unit": parsed_result.get("unit")})
            return {"value": parsed_result.get("value"), "unit": parsed_result.get("unit")}

    except json.JSONDecodeError:
        print(f"Error decoding JSON for entity {entity_name}")
    except Exception as e:
        print(f"Error processing entity {entity_name}: {str(e)}")

    return {"value": None, "unit": None}

def format_prediction(result: dict) -> str:
    if result["value"] is None or result["unit"] is None or result["value"] == "None":
        return "NaN"
    return f"{result['value']} {result['unit']}"

def process_single_entry(row):
    image_url = row['image_link']
    entity_name = row['entity_name']
    extracted_text = extract_text_with_paddle(image_url)
    result = extract_entity_info(entity_name, extracted_text)
    prediction = format_prediction(result)
    return prediction

def process_batch(batch):
    return [process_single_entry(row) for row in batch]

def process_csv_parallel(input_csv: str, output_csv: str, batch_size: int = 100, num_workers: int = 10):
    with open(input_csv, 'r') as file:
        reader = csv.DictReader(file)
        rows = list(reader)

    batches = [rows[i:i + batch_size] for i in range(0, len(rows), batch_size)]
    
    predictions = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
        for future in tqdm(as_completed(future_to_batch), total=len(batches), desc="Processing batches"):
            predictions.extend(future.result())

    with open(output_csv, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['index', 'prediction'])
        for index, prediction in enumerate(predictions):
            writer.writerow([index, prediction])

    return predictions

if __name__ == "__main__":
    input_csv = "./dataset/test.csv"
    output_csv = "test_out.csv"
    batch_size = 100
    num_workers = 20  # Adjust based on your system's capabilities
    
    start_time = time.time()
    process_csv_parallel(input_csv, output_csv, batch_size, num_workers)
    end_time = time.time()
    
    print(f"Total processing time: {end_time - start_time:.2f} seconds")
    print(f"Number of unique images processed: {len(unique_images)}")
    print(f"Number of unique entity-text pairs processed: {len(unique_entity_text_pairs)}")
    
    # Calculate and display cache hit rates
    total_entries = len(rows)
    image_cache_hit_rate = (total_entries - len(unique_images)) / total_entries * 100
    entity_text_cache_hit_rate = (total_entries - len(unique_entity_text_pairs)) / total_entries * 100
    
    print(f"Image cache hit rate: {image_cache_hit_rate:.2f}%")
    print(f"Entity-text pair cache hit rate: {entity_text_cache_hit_rate:.2f}%")

[2024/09/15 20:25:01] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/adityaastronomy/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/adityaastronomy/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batc

Processing batches:   0%|          | 0/1312 [00:00<?, ?it/s]

Error processing image https://m.media-amazon.com/images/I/31N12Io5QzL.jpg: cannot identify image file '/Users/adityaastronomy/Downloads/student_resource 3/temp_image.jpg'
Error processing image https://m.media-amazon.com/images/I/31S1c-vaQpL.jpg: cannot identify image file '/Users/adityaastronomy/Downloads/student_resource 3/temp_image.jpg'
Error processing image https://m.media-amazon.com/images/I/41+jUzyRl5L.jpg: image file is truncated (0 bytes not processed)
[2024/09/15 20:25:12] ppocr DEBUG: dt_boxes num : 2, elapsed : 2.6852238178253174
[2024/09/15 20:25:12] ppocr DEBUG: dt_boxes num : 2, elapsed : 2.5767359733581543
[2024/09/15 20:25:12] ppocr DEBUG: cls num  : 2, elapsed : 0.20510601997375488
[2024/09/15 20:25:12] ppocr DEBUG: dt_boxes num : 10, elapsed : 5.287827014923096
[2024/09/15 20:25:12] ppocr DEBUG: cls num  : 2, elapsed : 0.07788586616516113
[2024/09/15 20:25:13] ppocr DEBUG: rec_res num  : 2, elapsed : 0.5067250728607178
[2024/09/15 20:25:13] ppocr DEBUG: dt_boxes nu

IOStream.flush timed out


Error processing image https://m.media-amazon.com/images/I/31NGa8b9Q0L.jpg: cannot identify image file '/Users/adityaastronomy/Downloads/student_resource 3/temp_image.jpg'
[2024/09/15 20:27:31] ppocr DEBUG: rec_res num  : 3, elapsed : 30.48556613922119
Error decoding JSON for entity depth
Error decoding JSON for entity height
[2024/09/15 20:27:43] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.9620442390441895
[2024/09/15 20:27:50] ppocr DEBUG: cls num  : 3, elapsed : 5.753436088562012
Error processing image https://m.media-amazon.com/images/I/41-RyOwkTmL.jpg: broken data stream when reading image file
Error decoding JSON for entity width


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


Error decoding JSON for entity height[2024/09/15 20:28:39] ppocr DEBUG: rec_res num  : 3, elapsed : 18.62928581237793

Error processing entity width: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01j7twv1rjfa2b6ktpvfh4c8cp` on tokens per minute (TPM): Limit 6000, Used 6626, Requested 499. Please try again in 11.253s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing entity item_weight: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01j7twv1rjfa2b6ktpvfh4c8cp` on tokens per minute (TPM): Limit 6000, Used 6097, Requested 511. Please try again in 6.08s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[2024/09/15 20:29:06] ppocr DEBUG: dt_boxes num : 5, elapsed : 14.634439945220947
Error processing entity h

IOStream.flush timed out


[2024/09/15 20:30:24] ppocr DEBUG: dt_boxes num : 10, elapsed : 76.47211813926697
[2024/09/15 20:30:25] ppocr DEBUG: dt_boxes num : 6, elapsed : 2.587583065032959
Error processing entity height: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01j7twv1rjfa2b6ktpvfh4c8cp` on tokens per minute (TPM): Limit 6000, Used 7858, Requested 502. Please try again in 23.605s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
[2024/09/15 20:30:31] ppocr DEBUG: rec_res num  : 5, elapsed : 7.024954080581665
Error processing image https://m.media-amazon.com/images/I/314Gwp04mYL.jpg: OpenCV(4.10.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/imgcodecs/src/loadsave.cpp:813: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error processing image https://m.media-amazon.com/images/I/31-bBqUQPuL.jpg: OpenCV(4.10.0) /U

In [None]:
predictions