In [None]:
import pandas as pd
import re

# Constants from constants.py
# Unit families that are shared by more than one entity name.  They are kept
# as tuples and copied into a fresh set per entity so that every entry of
# entity_unit_map owns an independent set object.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')

# Entity name -> set of unit names that are valid for that entity.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Every unit name that is valid for at least one entity.
allowed_units = set().union(*entity_unit_map.values())

# Comprehensive mapping of unit variations to standardized units.
# The table is written as "standard unit -> accepted spellings" and inverted
# below, so each standard unit is named exactly once.
_unit_aliases = {
    # Length units
    'centimetre': ('cm', 'centimeter', 'centimeters'),
    'foot': ('ft', 'feet', "'"),
    'inch': ('in', 'inches', '"'),
    'metre': ('m', 'meter', 'meters'),
    'millimetre': ('mm', 'millimeter', 'millimeters'),
    'yard': ('yd', 'yds', 'yards'),

    # Weight units
    'gram': ('g', 'gm', 'gr', 'grams'),
    'kilogram': ('kg', 'kgs', 'kilograms', 'kilo', 'kilos'),
    'microgram': ('mcg', 'micrograms', 'µg'),
    'milligram': ('mg', 'mgs', 'milligrams'),
    'ounce': ('oz', 'ounces'),
    'pound': ('lb', 'lbs', 'pounds'),
    'ton': ('t', 'tons'),

    # Voltage units
    'kilovolt': ('kv', 'kilovolts'),
    'millivolt': ('mv', 'millivolts'),
    'volt': ('v', 'volts'),

    # Wattage units
    'kilowatt': ('kw', 'kilowatts'),
    'watt': ('w', 'watts'),

    # Volume units
    'centilitre': ('cl', 'centiliter', 'centiliters'),
    'cubic foot': ('cu ft', 'cubic feet', 'ft³'),
    'cubic inch': ('cu in', 'cubic inches', 'in³'),
    'cup': ('cup', 'cups'),
    'decilitre': ('dl', 'deciliter', 'deciliters'),
    'fluid ounce': ('fl oz', 'fluid ounces', 'fl. oz.'),
    'gallon': ('gal', 'gallons'),
    'imperial gallon': ('imp gal', 'imperial gallons'),
    'litre': ('l', 'liter', 'liters'),
    'microlitre': ('µl', 'microliter', 'microliters'),
    'millilitre': ('ml', 'milliliter', 'milliliters'),
    'pint': ('pt', 'pints'),
    'quart': ('qt', 'quarts'),
}

# Spelling (lower-case) -> standard unit name.
unit_mapping = {
    alias: standard
    for standard, aliases in _unit_aliases.items()
    for alias in aliases
}

# A leading number followed by either a quote mark (the feet / inches
# shorthand listed in unit_mapping) or one or two unit words, each of which
# may be terminated by a dot ("fl. oz.").
_QUANTITY_RE = re.compile(
    r'''(\d+(?:\.\d+)?)\s*(['"]|[a-zA-Z³µ]+\.?(?:\s*[a-zA-Z³µ]+\.?)?)'''
)


def _canonical_unit(unit):
    """Return the allowed unit that the lower-cased *unit* denotes, or None."""
    spellings = [unit, unit.rstrip('.')]
    # Accept a plain trailing "s" on anything that is otherwise recognised,
    # whether it is an alias ("kgs") or already a standard name ("metres").
    spellings += [spelling[:-1] for spelling in spellings if spelling.endswith('s')]
    for spelling in spellings:
        canonical = unit_mapping.get(spelling, spelling)
        if canonical in allowed_units:
            return canonical
    return None


def standardize_unit(value):
    """Normalise a "<number><unit>" string to ``"<number:.2f> <standard unit>"``.

    Args:
        value: The raw extracted text.  Anything that is not a ``str``
            (including NaN / None) is returned untouched.

    Returns:
        The normalised string when *value* starts with a number followed by a
        unit that resolves, via ``unit_mapping``, to a member of
        ``allowed_units``; otherwise *value* itself, unchanged.
    """
    if not isinstance(value, str):
        return value

    match = _QUANTITY_RE.match(value.strip())
    if match is None:
        return value

    number, raw_unit = match.groups()
    # Lower-case and collapse internal whitespace so "Fl  Oz" matches "fl oz".
    unit = ' '.join(raw_unit.lower().split())

    canonical = _canonical_unit(unit)
    if canonical is None and ' ' in unit:
        # The pattern grabs up to two words so that "fl oz" / "cubic feet"
        # work, but for "5 kg net" the second word is not part of the unit:
        # retry with the first word alone.
        canonical = _canonical_unit(unit.split(' ', 1)[0])

    if canonical is None:
        return value
    return f"{float(number):.2f} {canonical}"

# Example usage: load the model outputs and add a 'standardized_value'
# column holding the normalised form of every entry in 'extracted_value'.
df = pd.read_csv(filepath_or_buffer='/content/complete_outputs (1).csv')
df['standardized_value'] = df.loc[:, 'extracted_value'].apply(standardize_unit)
print(df)

    Unnamed: 0                                         image_link  group_id  \
0           20  https://m.media-amazon.com/images/I/31EvJszFVf...    731432   
1           70  https://m.media-amazon.com/images/I/413FQB0ZML...    308856   
2           71  https://m.media-amazon.com/images/I/41EjbFu-+y...    308856   
3           46  https://m.media-amazon.com/images/I/41wvffSxB4...    299791   
4           26  https://m.media-amazon.com/images/I/51WsuKKAVr...    866516   
..         ...                                                ...       ...   
94          57  https://m.media-amazon.com/images/I/915w0BdW-g...    993359   
95           8  https://m.media-amazon.com/images/I/91Cma3Rzse...    731432   
96          12  https://m.media-amazon.com/images/I/91LPf6OjV9...    281678   
97          47  https://m.media-amazon.com/images/I/91cErO-KbL...    237000   
98          19  https://m.media-amazon.com/images/I/91prZeizZn...    731432   

    entity_name    entity_value         tags  \
0  

In [None]:
# Persist the frame without its positional index: writing the index is what
# produces a spurious "Unnamed: 0" column the next time the file is read.
df.to_csv('new_file.csv', index=False)

In [None]:
import pandas as pd
import re

# A well-formed decimal ("5", "5.", "5.25", ".5") followed by one or more
# unit words.  Capturing every word keeps multi-word units such as
# "cubic foot" and "cubic inch" distinguishable.
_VALUE_UNIT_RE = re.compile(r'(\d+\.?\d*|\.\d+)\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)')


def extract_value_and_unit(value):
    """Split a "<number> <unit>" value into a comparable ``(number, unit)`` pair.

    Args:
        value: A string such as ``"2.00 kilogram"``, or a missing value
            (None / NaN).  Non-string values are converted with ``str()``.

    Returns:
        ``(number, unit)`` where *number* is a float rounded to two decimals
        and *unit* is the lower-cased unit text with whitespace collapsed, or
        ``(None, None)`` when *value* is missing or does not start with a
        number followed by a unit.
    """
    if pd.isna(value):
        return None, None
    match = _VALUE_UNIT_RE.match(str(value).strip())
    if match is None:
        return None, None
    number, unit = match.groups()
    return round(float(number), 2), ' '.join(unit.lower().split())

def are_values_equal(val1, val2):
    """Return True when two parsed ``(number, unit)`` pairs are the same quantity.

    Args:
        val1: A ``(number, unit)`` pair, or None.
        val2: A ``(number, unit)`` pair, or None.

    Returns:
        False if either argument is None or contains a None component (the
        ``(None, None)`` "could not parse" sentinel); otherwise whether both
        the numbers and the units are equal.
    """
    if val1 is None or val2 is None:
        return False
    num1, unit1 = val1
    num2, unit2 = val2
    # Two values that both failed to parse must not count as a match.
    if any(part is None for part in (num1, unit1, num2, unit2)):
        return False
    return num1 == num2 and unit1 == unit2

def _has_parsed_value(pair):
    """Return True if *pair* is a ``(number, unit)`` tuple with a real number."""
    return isinstance(pair, tuple) and len(pair) == 2 and pair[0] is not None


def calculate_f1_score(df):
    """Score ``df['standardized_value']`` against ``df['entity_value']``.

    Both columns are parsed with ``extract_value_and_unit`` and the parsed
    pairs are stored on *df* as new ``'actual'`` and ``'predicted'`` columns
    (the frame is modified in place).

    A row is a true positive when both sides parsed and are equal.  Only rows
    whose value actually parsed count towards the number of predictions /
    actuals: the ``(None, None)`` sentinel is a tuple, so ``Series.notna()``
    would report it as present and inflate both denominators.

    Args:
        df: DataFrame with ``'entity_value'`` and ``'standardized_value'``
            columns.

    Returns:
        ``(f1_score, precision, recall, true_positives, false_positives,
        false_negatives)``.  Ratios are 0 when their denominator is 0.
    """
    df['actual'] = df['entity_value'].apply(extract_value_and_unit)
    df['predicted'] = df['standardized_value'].apply(extract_value_and_unit)

    pairs = list(zip(df['actual'], df['predicted']))

    true_positives = sum(
        1
        for actual, predicted in pairs
        if _has_parsed_value(actual)
        and _has_parsed_value(predicted)
        and are_values_equal(actual, predicted)
    )
    total_predictions = sum(1 for _, predicted in pairs if _has_parsed_value(predicted))
    total_actuals = sum(1 for actual, _ in pairs if _has_parsed_value(actual))

    false_positives = total_predictions - true_positives
    false_negatives = total_actuals - true_positives

    precision = true_positives / total_predictions if total_predictions > 0 else 0
    recall = true_positives / total_actuals if total_actuals > 0 else 0

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1_score, precision, recall, true_positives, false_positives, false_negatives

# Score the frame built above; calculate_f1_score also adds the parsed
# 'actual' and 'predicted' columns to df.
f1_score, precision, recall, tp, fp, fn = calculate_f1_score(df)

# One metric per line: the three ratios first, then the raw counts.
print(
    f"F1 Score: {f1_score:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"True Positives: {tp}",
    f"False Positives: {fp}",
    f"False Negatives: {fn}",
    sep="\n",
)

# Side-by-side view of the raw and parsed values for manual inspection.
print(df.loc[:, ['entity_value', 'standardized_value', 'actual', 'predicted']])

F1 Score: 0.4646
Precision: 0.4646
Recall: 0.4646
True Positives: 46
False Positives: 53
False Negatives: 53
      entity_value standardized_value               actual  \
0         200 gram                NaN        (200.0, gram)   
1       2 kilogram      2.00 kilogram      (2.0, kilogram)   
2      10 kilogram                NaN     (10.0, kilogram)   
3        15.5 gram                NaN         (15.5, gram)   
4       158.0 gram        158.00 gram        (158.0, gram)   
..             ...                ...                  ...   
94        500 gram        500.00 gram        (500.0, gram)   
95  1400 milligram  1400.00 milligram  (1400.0, milligram)   
96      3.53 ounce                NaN        (3.53, ounce)   
97      200.0 gram                NaN        (200.0, gram)   
98        200 gram                NaN        (200.0, gram)   

              predicted  
0          (None, None)  
1       (2.0, kilogram)  
2          (None, None)  
3          (None, None)  
4         (158.0