## Imports

In [28]:
import os
import pandas as pd

## Loading Data

In [29]:
output = pd.read_csv('../sample_test_output.csv')

In [30]:
output.head(10)

Unnamed: 0,index,image_link,group_id,entity_name,output
0,0,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,width,20CM\n15CM
1,1,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,depth,20CM\n15CM
2,2,https://m.media-amazon.com/images/I/417NJrPEk+...,939426,maximum_weight_recommendation,Deodorizing module\nCat litter shovel\nAdsorb ...
3,3,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,voltage,Professional tools\nBlade Diameter.\n305mm\nRa...
4,4,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,wattage,Professional tools\nBlade Diameter.\n305mm\nRa...
5,5,https://m.media-amazon.com/images/I/41ADVPQgZO...,993359,item_weight,Calabrian Chili\nPowder\nWh52.91OZg
6,6,https://m.media-amazon.com/images/I/41nblnEkJ3...,648011,voltage,LED\nOSRAM\n5.5w=40w\n470 lm\nWarm\nWhite\nB22d
7,7,https://m.media-amazon.com/images/I/41nblnEkJ3...,648011,wattage,LED\nOSRAM\n5.5w=40w\n470 lm\nWarm\nWhite\nB22d
8,8,https://m.media-amazon.com/images/I/41o3iis9E7...,487566,height,Ideal Bed & Armchair Cover\nNeat size foldaway...
9,9,https://m.media-amazon.com/images/I/41pvwR9Gba...,965518,voltage,CAKLAR\n20V204\nLED WORK LIGHT for Dark Areas


## Output Preprocessing

In [31]:
import pandas as pd
import re

# mapping to standardize common unit abbreviations to full forms
unit_standardization = {
    'cm': 'centimetre',
    'mm': 'millimetre',
    'm': 'metre',
    'ft': 'foot',
    'in': 'inch',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'µg': 'microgram',
    'oz': 'ounce',
    'lb': 'pound',
    't': 'ton',
    'ml': 'millilitre',
    'l': 'litre',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'v': 'volt',
    'kw': 'kilowatt',
    'w': 'watt'
}

# entity to unit mapping
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Function to standardize units in the text
def standardize_units(text):
    text = text.lower()  # Normalize case
    for short_form, full_form in unit_standardization.items():
        text = re.sub(rf'\b{short_form}\b', full_form, text)
    return text

# Function to preprocess text by adding spaces between numbers and units and removing \n
def preprocess_text(text):
    text = text.replace('\n', ' ') 
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text) 
    text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', text) 
    text = re.sub(r'\s+', ' ', text).strip() 
    return text

# Function to extract numerical values and units from text
def extract_values_and_units(text):
    pattern = r'\b(\d+(\.\d+)?)\s*([a-zA-Z]+)?\b'
    matches = re.findall(pattern, text)
    values = [match[0] for match in matches]
    units = [match[2] for match in matches if match[2]]
    return values, units

# Function to assign default units based on entity name if no unit is present
def assign_default_units(row):
    if not row['units']:  # If the units list is empty
        default_unit = next(iter(entity_unit_map.get(row['entity_name'], [])), '')
        row['units'] = [default_unit] if default_unit else []
    return row

# Main function to process the DataFrame
def process_output(output):
    output['output'] = output['output'].apply(standardize_units)

    output['processed_output'] = output['output'].apply(preprocess_text)

    output['numerical_values'], output['units'] = zip(*output['processed_output'].apply(extract_values_and_units))

    output = output.apply(assign_default_units, axis=1)

    return output


In [32]:
output = process_output(output)

In [36]:
output.sample(5)

Unnamed: 0,index,image_link,group_id,entity_name,output,processed_output,numerical_values,units
26,26,https://m.media-amazon.com/images/I/51BEuVR4Zz...,695925,width,70\n110.\ncentimetre,70 110. centimetre,"[70, 110]",[centimetre]
24,24,https://m.media-amazon.com/images/I/514pScQdlC...,997176,voltage,12v car heating cup\npowered by dc 12v cigaret...,12 v car heating cup powered by dc 12 v cigare...,"[12, 12, 12]","[v, v, v]"
77,77,https://m.media-amazon.com/images/I/71eCfiIG-A...,275506,item_weight,sports\nheadphones\nyou can'ton shakeit off wh...,sports headphones you can'ton shakeit off when...,"[4.7, 5.3, 90, 24, 480, 1]","[g, sync, h, h, h]"
47,47,https://m.media-amazon.com/images/I/51oaOP8qJl...,140266,depth,148mm\n55.6mm,148 mm 55.6 mm,"[148, 55.6]","[mm, mm]"
49,49,https://m.media-amazon.com/images/I/51r7U52rh7...,860821,wattage,energyinformation\nat high speed\nairflow\nele...,energyinformation at high speed airflow electr...,"[4, 400, 45, 98, 36, 48, 71, 86]","[cubic, ceiling, to, cubic]"
