In [1]:
import pandas as pd

In [None]:
import os
import pandas as pd
from paddleocr import PaddleOCR
from tqdm import tqdm  # Import tqdm for progress bar

# Initialize the OCR model once at module level (lang='en'); get_text() calls
# ocr_model.ocr(...) on this shared instance for every image it processes.
ocr_model = PaddleOCR(lang='en')



In [8]:
def get_text(image_path):
    """Run OCR on one image and return the recognised text as a single string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text fragments joined by single spaces, or one of the
        status strings 'Empty context' (nothing recognised), 'File Not Found'
        (the path does not exist) or 'Error in OCR' (any other failure).
    """
    try:
        # Fail early with a dedicated status instead of letting the OCR call fail
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"File {image_path} not found.")

        result = ocr_model.ocr(image_path)

        # Each entry of `result` is a list of detected lines; every line ends with
        # a (text, score) pair. An entry may be None/empty when nothing was
        # detected (seen with some PaddleOCR versions), so skip those instead of
        # iterating over None and reporting a spurious OCR error.
        fragments = [
            line[-1][0]
            for page in (result or [])
            if page
            for line in page
        ]

        context = ' '.join(fragments)

        # Nothing recognised (or only whitespace)
        if not context.strip():
            return 'Empty context'

        return context

    except FileNotFoundError as fnf_error:
        print(fnf_error)
        return 'File Not Found'

    except Exception as e:
        # Best effort: report the failure and keep the batch going
        print(f"Error processing {image_path}: {e}")
        return 'Error in OCR'

# The next cell applies get_text to every image name listed in the DataFrame



In [None]:
try:
    # Load the table that lists the images to process
    test_csv = pd.read_csv('final_test.csv')

    # Register progress_apply on pandas objects so the OCR pass shows a progress bar
    tqdm.pandas()

    def _ocr_for_image(image_name):
        """OCR the named file inside the 'test_images' directory."""
        return get_text(os.path.join('test_images', image_name))

    test_csv['context'] = test_csv['Image_names'].progress_apply(_ocr_for_image)

    # Persist the table, now including the OCR text, for the later cells
    test_csv.to_csv('Output.csv', index=False)

except FileNotFoundError as fnf_error:
    print(f"CSV file not found: {fnf_error}")

except Exception as e:
    print(f"Error processing DataFrame: {e}")

In [1]:
import ollama

# Reusable instruction prompt sent to the model. The text itself contains no
# placeholders: extract_text() appends the "Parameter: ..." and "Paragraph: ..."
# lines to the end of it for each request.
prompt_template = '''You are a text extractor tasked with identifying specific numerical information from a given paragraph. The information you need to extract pertains to a given parameter.

Example:
Given the parameter item_weight and the input paragraph:

"Tiens TEAPOLY CAPSULESNUTRACEUTICA 14 ksa 10017011004489 Net Weight:0.41g x 60capsules"
Your task:
Extract and output only the relevant numerical value with its units.
For this example, you should output in this format {0.41 g} and strictly do not print anything else not even your analysis, i just need numerical value and its unit. if you do not know the answer output empty string.
Parameters to consider:

item_weight
item_volume
voltage
wattage
maximum_weight_recommendation
height
depth
width
Note: The parameter to extract will be provided as a separate argument.
'''

# Build the prompt for one (paragraph, parameter) pair and query the model
def extract_text(paragraph, parameter, model='llama3'):
    """Ask the model for the value of `parameter` found in `paragraph`.

    Args:
        paragraph: Text (e.g. OCR output) to search.
        parameter: Name of the quantity to extract, e.g. 'item_weight'.
        model: Name of the Ollama model to query. Defaults to 'llama3'.

    Returns:
        The text found inside the first '{...}' of the model's reply with
        surrounding whitespace removed, or "" when the reply contains no '{'.
    """
    # The template carries the instructions; the request-specific part is appended
    prompt = prompt_template + f"\nParameter: {parameter}\nParagraph: {paragraph}"

    # Single-turn chat request through the Ollama client
    response = ollama.chat(model=model, messages=[
        {
            'role': 'user',
            'content': prompt,
        },
    ])

    message = response['message']['content']

    # The prompt asks for an answer shaped like "{0.41 g}": keep what follows
    # the first '{', cut at the next '{' or '}' (whichever comes first).
    pieces = message.split('{', 2)
    if len(pieces) < 2:
        return ""
    return pieces[1].split('}', 1)[0].strip()




In [2]:
# Smoke-test extract_text on three hand-written snippets.
# Each parameter lives in a one-element list, but extract_text formats its
# `parameter` argument straight into the prompt, so pass the name itself
# (element 0); passing the list would put "['width']" into the prompt.
parameters_1 = ['item_weight']
paragraph_1 = "P&BPET&BIO N Nutri+gen Basic High Quality All Natural Excellent Formula GMO/Fat/Chemica free HET WEOHT120.5g4"

# 'wattage' is the name used in the prompt's parameter list
parameters_2 = ['wattage']
paragraph_2 = "ABC Electronic Item Voltage: 220V, Power: 1000W, Weight: 1.5kg"

parameters_3 = ['width']
paragraph_3 = '30 cm 30cm 15cm'

extracted_data_3 = extract_text(paragraph_3, parameters_3[0])
print(extracted_data_3)

extracted_data_1 = extract_text(paragraph_1, parameters_1[0])
print(extracted_data_1)

extracted_data_2 = extract_text(paragraph_2, parameters_2[0])
print(extracted_data_2)



30 cm
120.5 g
1000 W


In [None]:
import pandas as pd 

# Reload the table saved as 'Output.csv' by the OCR cell (it includes the 'context' column)
df2 = pd.read_csv('Output.csv')

In [None]:
def apply_extract_text(row):
    """Run extract_text on one row, using its 'context' and 'entity_name' fields.

    Any exception is reported together with the row's label (`row.name`) and
    turned into a None result, so one bad row does not abort the whole pass.
    """
    try:
        extracted = extract_text(row['context'], row['entity_name'])
    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        extracted = None
    return extracted
batch = 1000  # number of rows processed between progress messages

# Run the extraction over df2 (the frame loaded from 'Output.csv') in
# consecutive chunks, starting at row 0, and store the results in 'target'.
total_rows = df2.shape[0]
for i in range(0, total_rows, batch):
    chunk = df2.iloc[i:i + batch]
    # Write back through the chunk's own index labels so the rows that were
    # read and the rows that are written are always the same ones.
    df2.loc[chunk.index, 'target'] = chunk.apply(apply_extract_text, axis=1)
    print(f"{min(i + batch, total_rows)} entries processed!!")

In [None]:
import re

# Allowed (fully spelled-out) units per entity, declared once per kind of quantity
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_voltage_units = ('kilovolt', 'millivolt', 'volt')
_wattage_units = ('kilowatt', 'watt')
_volume_units = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

# Every entity gets its own set object, so editing one entry never affects another
entity_unit_map = {
    'width': set(_length_units),
    'depth': set(_length_units),
    'height': set(_length_units),
    'item_weight': set(_weight_units),
    'maximum_weight_recommendation': set(_weight_units),
    'voltage': set(_voltage_units),
    'wattage': set(_wattage_units),
    'item_volume': set(_volume_units),
}

# Canonical unit name -> other lowercase spellings accepted for it (abbreviations,
# plurals, US spellings). Every canonical name is also accepted as itself.
# 'decilitre' and 'microlitre' are included because entity_unit_map allows them
# for item_volume.
_unit_aliases = {
    # mass
    'gram': ('g', 'grams'),
    'kilogram': ('kg', 'kgs', 'kilograms'),
    'milligram': ('mg', 'milligrams'),
    'microgram': ('mcg', 'micrograms'),
    'ounce': ('oz', 'ounces'),
    'pound': ('lb', 'lbs', 'pounds'),
    'ton': ('tons',),
    # electrical
    'volt': ('v', 'volts'),
    'kilovolt': ('kv', 'kilovolts'),
    'millivolt': ('mv', 'millivolts'),
    'watt': ('w', 'watts'),
    'kilowatt': ('kw', 'kilowatts'),
    # length
    'centimetre': ('cm', 'centimetres', 'centimeter', 'centimeters'),
    'millimetre': ('mm', 'millimetres', 'millimeter', 'millimeters'),
    'metre': ('m', 'metres', 'meter', 'meters'),
    'kilometre': ('km', 'kilometres', 'kilometer', 'kilometers'),
    'inch': ('in', 'inches'),
    'foot': ('ft', 'feet'),
    'yard': ('yd', 'yards'),
    # volume
    'millilitre': ('ml', 'millilitres', 'milliliter', 'milliliters'),
    'litre': ('l', 'litres', 'liter', 'liters'),
    'centilitre': ('cl', 'centilitres', 'centiliter', 'centiliters'),
    'decilitre': ('dl', 'decilitres', 'deciliter', 'deciliters'),
    'microlitre': ('microlitres', 'microliter', 'microliters'),
    'fluid ounce': ('fl oz', 'fluid ounces'),
    'gallon': ('gal', 'gallons'),
    'imperial gallon': ('imp gal', 'imperial gallons'),
    'pint': ('pt', 'pints'),
    'quart': ('qt', 'quarts'),
    'cubic foot': ('cu ft', 'cubic feet'),
    'cubic inch': ('cu in', 'cubic inches'),
    'cup': ('cups',),
}

# Flat lookup used for unit conversions: accepted spelling -> canonical unit name
unit_conversion = {
    spelling: canonical
    for canonical, aliases in _unit_aliases.items()
    for spelling in (canonical, *aliases)
}

# Shape every successful result must have: "<number> <unit words>"
pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')

# Leading number (optionally signed, with either proper thousands separators or
# plain digits of any length) followed by one or more alphabetic unit words.
_value_unit_pattern = re.compile(
    r"([+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?)\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)"
)

def convert_unit(input_str, entity):
    """Normalise a raw "<number><unit>" string to "<float> <canonical unit>".

    Args:
        input_str: Text such as "5.5 kg", "1000 W", "1,500 W" or "12 fl oz".
            Anything following the unit is ignored. A non-string value
            (e.g. None or NaN) is treated as unparseable.
        entity: Entity name used to look up the permitted units in
            entity_unit_map.

    Returns:
        The normalised string on success, otherwise one of the status strings
        "Unit mismatch" (no leading number followed by a unit),
        "Unit not full" (unit not present in unit_conversion),
        "Unit not allowed" (unknown entity, or unit not permitted for it) or
        "Pattern Not matched" (result does not have the expected shape).
    """
    # Missing extractions arrive as None/NaN rather than text
    if not isinstance(input_str, str):
        return "Unit mismatch"

    match = _value_unit_pattern.match(input_str.strip())
    if not match:
        return "Unit mismatch"

    value = float(match.group(1).replace(',', ''))

    # Normalise the unit to its full form. Try the longest run of words first so
    # multi-word units ("fl oz", "cubic inch") are recognised, then fall back to
    # shorter prefixes so trailing words ("5 kg net") are ignored.
    words = match.group(2).lower().split()
    unit_full = None
    for word_count in range(len(words), 0, -1):
        unit_full = unit_conversion.get(' '.join(words[:word_count]))
        if unit_full:
            break

    if not unit_full:
        return "Unit not full"

    # Unknown entities and units outside the entity's allowed set are both rejected
    allowed_units = entity_unit_map.get(entity)
    if not allowed_units or unit_full not in allowed_units:
        return "Unit not allowed"

    result = f"{value} {unit_full}"

    # Final guard: floats rendered in exponent notation, for example, do not match
    if pattern.match(result):
        return result
    return "Pattern Not matched"

# Example usage (convert_unit formats the value as a float, so "10" is printed as "10.0")
print(convert_unit("5.5 kg", "item_weight"))  # Output: 5.5 kilogram
print(convert_unit("10 V", "voltage"))        # Output: 10.0 volt
print(convert_unit("20 W", "wattage"))        # Output: 20.0 watt
print(convert_unit("15 cm", "width"))         # Output: 15.0 centimetre
print(convert_unit("42 cm / 16.54", "height")) # Output: 42.0 centimetre (only the leading "42 cm" is parsed)
print(convert_unit("500 ml", "item_volume"))  # Output: 500.0 millilitre
print(convert_unit("15 km", "depth"))          # Output: Unit not allowed ('kilometre' is not in the allowed set for 'depth')


In [None]:
def _row_to_prediction(row):
    """Normalise one row's 'target' text for the row's 'entity_name'."""
    return convert_unit(row['target'], row['entity_name'])

# Row-wise conversion of the extracted text into the 'prediction' column
df2['prediction'] = df2.apply(_row_to_prediction, axis=1)

In [None]:
def _blank_zero_prediction(prediction):
    """Return '' when the leading number of a "<value> <unit>" string is zero.

    The number is compared numerically because convert_unit renders values as
    floats ("0.0 gram"), which a textual comparison with '0' never matches.
    Strings whose first token is not a number are returned unchanged.
    """
    first_token = prediction.split(' ')[0]
    try:
        is_zero = float(first_token) == 0
    except ValueError:
        # e.g. one of convert_unit's status strings, or an already blank value
        is_zero = False
    return '' if is_zero else prediction

# Blank out predictions whose value is zero
df2['prediction'] = df2['prediction'].apply(_blank_zero_prediction)

In [None]:
# Columns that are not written to the final file
_columns_to_drop = ['context', 'target', 'entity_value', 'Image_names', 'group_id', 'image_link']
df2 = df2.drop(columns=_columns_to_drop)


In [None]:
# Remove the 'entity_name' column from df2 in place
df2.drop(labels=['entity_name'], axis='columns', inplace=True)

In [None]:
# Write the remaining columns to disk, leaving out the index
_final_csv_path = "finaltry.csv"
df2.to_csv(_final_csv_path, index=False)