In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [5]:
import csv

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [6]:
# The label-text CSV is read with header=None, so pandas numbers the columns.
file_path = "label_data/label_text_Adrian.csv"
df = (
    pd.read_csv(file_path, header=None)
    .drop(10)  # row 10 is a repeat label
    .reset_index(drop=True)
)

In [7]:
import spacy

# Build the three pretrained NER pipelines that text_extraction() relies on:
# one for weights, one for food names and one for dates.
weight_pipe = pipeline(
    "ner",
    model="d4data/biomedical-ner-all",
    aggregation_strategy="average",
)
food_pipe = pipeline(
    "ner",
    model="Dizex/InstaFoodRoBERTa-NER",
    aggregation_strategy="max",
)
date_pipe = pipeline(
    "ner",
    model="mdarhri00/named-entity-recognition",
    aggregation_strategy="average",
)


In [8]:
def text_extraction(text):
    """Extract expiry date, packaged date, weight and food name from label text.

    Runs the module-level NER pipelines ``weight_pipe``, ``date_pipe`` and
    ``food_pipe`` over ``text`` and post-processes the entities they return
    (dicts read here via the keys ``entity_group``, ``word``, ``score``,
    ``start`` and ``end``):

    * Dates: each ``date_time`` entity is classified from keywords found in
      the 20 characters before it and the 5 characters after it. Expiry
      keywords are tested before packaging keywords. If several entities
      match the same kind, the last one wins.
    * Weight: among entities whose text looks like a weight, the one with the
      highest score is kept.
    * Food name: the highest-scoring food entity is kept.

    Parameters
    ----------
    text : str
        Text read from a product label.

    Returns
    -------
    tuple
        ``(expiry_date, packaged_date, weight, food_name)``. Each element is
        the matched entity text, or ``np.nan`` when nothing was found.
    """
    # Apply the pretrained models
    lab_value_ents = weight_pipe(text, aggregation_strategy="average")
    date_ents = date_pipe(text)
    food_ents = food_pipe(text)

    food_name = np.nan
    packaged_date = np.nan
    expiry_date = np.nan
    weight = np.nan

    # --- Dates -------------------------------------------------------------
    # Keyword lists are loop-invariant, so they are defined once up front.
    packaging_phrases = ("packed", "pkg on", "packaged on", 'production date')
    expiry_phrases = ("best before", "use by", "expiry date", "expires on",
                      "expires by", 'expires', 'after')

    for ent in date_ents:
        if ent['entity_group'] != "date_time":
            continue

        start = ent["start"]
        end = ent["end"]

        # Clamp the look-behind window at 0. A negative slice start would
        # wrap around to the end of the string and lose the leading keyword
        # whenever the date sits within the first 20 characters.
        before_ent = text[max(0, start - 20):start]
        after_ent = text[end:end + 5]
        context = before_ent.lower() + " " + after_ent.lower()

        # Expiry is checked first, so it wins if both kinds of keyword appear.
        if any(phrase in context for phrase in expiry_phrases):
            expiry_date = ent["word"]
        elif any(phrase in context for phrase in packaging_phrases):
            packaged_date = ent["word"]

    # --- Weight ------------------------------------------------------------
    weight_phrases = ('0g', '1g', '2g', '3g', '4g', '5g', '6g', '7g', '8g',
                      '9g', 'kg', 'grams', 'kilograms')
    # Entities containing any of these substrings are rejected.
    # NOTE(review): presumably filters ranges ("-") and units followed by
    # further text -- confirm the intent with the author.
    banned_weight_phrases = ("-", "kg ", "g ")

    weight_entities = []
    for ent in lab_value_ents:
        word = ent['word'].lower()
        looks_like_weight = any(phrase in word for phrase in weight_phrases)
        is_banned = any(phrase in word for phrase in banned_weight_phrases)
        if looks_like_weight and not is_banned:
            weight_entities.append(ent)

    if weight_entities:
        weight = max(weight_entities, key=lambda entity: entity['score'])["word"]

    # --- Food name ---------------------------------------------------------
    if food_ents:
        food_name = max(food_ents, key=lambda entity: entity['score'])["word"]

    return expiry_date, packaged_date, weight, food_name

In [19]:
# Sanity check: run the extractor on one label (row 1 of column 0) and show
# the extracted fields next to the source text.
text = df.loc[1, 0]

expiry_date, packaged_date, weight, food_name = text_extraction(text)

report_lines = [
    "",
    f"Food Name: {food_name}",
    f"Weight: {weight}",
    f"Packaged Date: {packaged_date}",
    f"Expiry Date: {expiry_date}",
    "",
    f"Example Text: {text}",
]
print("\n".join(report_lines))


Food Name:  Australia Beef Burger Bulk Pack
Weight: 13. 63 kg
Packaged Date: 14 May 2024
Expiry Date: 14 May 2025

Example Text: Keep Frozen Product of Australia Beef Burger Bulk Pack Net Wt 13.63 kg 13.63 KG Pkg On 14 May 2024 Best Before 14 May 2025 Top Cut Foods Pty Ltd 101265 Boneless Beef NL Allergens


In [10]:
# Output schema. The packaged date is extracted below but not stored.
column_names = ["Product Name", "Weight", "Expiry Date"]

# Run the extractor over every label text in column 0.
data = []
for text in df[0]:
    expiry_date, packaged_date, weight, food_name = text_extraction(text)
    row = [food_name, weight, expiry_date]
    data.append(row)

# Collect the predictions in a frame and save them to CSV without the index.
extracted_df = pd.DataFrame(data=data, columns=column_names)
extracted_df.to_csv(path_or_buf='label_data/extracted_data.csv', index=False)

In [11]:
# Reading in the predicted Product Names, Weights, and Expiry Dates from the
# CSV written by the extraction cell. Fields that were NaN at write time are
# stored as empty cells and come back as NaN here, which the accuracy helpers
# test for with pd.isna().
file_path = "label_data/extracted_data.csv"
predicted_df = pd.read_csv(file_path)

In [12]:
# Loading in the correctly annotated data which the predictions will be
# measured against for accuracy. label_pred_acc() reads the 'Product Name',
# 'Weight' and 'Expiry Date' columns from this frame, row by row.
file_path = "label_data/annotated_adrian.csv"
annotated_df = pd.read_csv(file_path)

In [14]:
from dateutil import parser
from datetime import datetime

def convert_date(date_list):
    """Normalise dates to the ``'%d %b %Y'`` layout, e.g. ``'23 Jul 2003'``.

    Each entry is parsed with ``parser.parse`` (dateutil) and re-formatted.
    Entries that cannot be converted are passed through unchanged rather than
    aborting the whole conversion:

    * non-string values such as NaN raise ``TypeError``;
    * text that is not a recognisable date raises ``ValueError`` (dateutil's
      ``ParserError`` is a subclass of it);
    * dates outside the representable range raise ``OverflowError``.

    Parameters
    ----------
    date_list : iterable
        Date strings, possibly mixed with missing values.

    Returns
    -------
    list
        One element per input entry, in the same order.
    """
    formatted_list = []

    for date_str in date_list:
        try:
            parsed_date = parser.parse(date_str)
            # Format the date to the desired format: e.g '23 Jul 2003'
            formatted_list.append(parsed_date.strftime('%d %b %Y'))
        except (TypeError, ValueError, OverflowError):
            # Keep the original value so downstream scoring still sees it.
            formatted_list.append(date_str)

    return formatted_list

In [15]:
# Rewrite the predicted expiry dates through convert_date(), which emits the
# '%d %b %Y' layout (e.g. '23 Jul 2003').
# NOTE(review): presumably this matches the date format used in the
# annotations -- confirm against annotated_adrian.csv.
predicted_df['Expiry Date'] = convert_date(predicted_df['Expiry Date'])

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import hamming

# Cosine Similarity Accuracy Measure

def text_similarity(text1, text2):
    """Calculate the cosine similarity between two texts using TF-IDF.

    Parameters
    ----------
    text1, text2 : str or NaN
        The two values to compare; either may be missing.

    Returns
    -------
    number
        A score from 0 to 100. Two missing values score 100, exactly one
        missing value scores 0, otherwise the TF-IDF cosine similarity is
        scaled to a percentage.
    """
    if pd.isna(text1) and pd.isna(text2):
        return 100  # Both are NaN, consider it 100% accurate
    if pd.isna(text1) or pd.isna(text2):
        return 0  # One is NaN, consider it 0% accurate

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # The default tokenizer ignores one-character tokens, so short values
        # such as "5 g" leave an empty vocabulary and fit_transform raises.
        # Fall back to an exact comparison instead of aborting the evaluation.
        return 100 if text1 == text2 else 0

    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0] * 100
    
#Levenshtein Distance Accuracy Measure

def levenshtein_distance(s1, s2):
    """Calculate the Levenshtein Distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are kept, sized by the
    shorter string.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare (any sequences supporting ``len`` and iteration).

    Returns
    -------
    int
        The edit distance; 0 when the strings are equal.
    """
    # Keep the shorter string along the row so the rows stay small.
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    # distances[j] = distance between s1[:j] and the prefix of s2 seen so far.
    distances = list(range(len(s1) + 1))
    for index2, char2 in enumerate(s2):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            cost = 0 if char1 == char2 else 1
            new_distances.append(min(
                distances[index1 + 1] + 1,   # char2 has no counterpart in s1
                new_distances[-1] + 1,       # char1 has no counterpart in s2
                distances[index1] + cost     # match or substitute
            ))
        distances = new_distances
    return distances[-1]

def similarity_percentage(s1, s2):
    """Calculate the similarity percentage between two strings.

    The score is ``(1 - levenshtein_distance / max_len) * 100``, where
    ``max_len`` is the length of the longer string.

    Parameters
    ----------
    s1, s2 : str or NaN
        The two values to compare; either may be missing.

    Returns
    -------
    number
        A score from 0 to 100. Two missing values score 100 (the same
        convention as ``text_similarity``), exactly one missing value scores
        0, and identical strings score 100.
    """
    s1_missing = pd.isna(s1)
    s2_missing = pd.isna(s2)
    if s1_missing and s2_missing:
        # NaN != NaN, so this case needs an explicit check to count as a match.
        return 100.0
    if s1_missing or s2_missing:
        return 0  # One is NaN, consider it 0% accurate
    if s1 == s2:
        # Also covers two empty strings, which would otherwise divide by zero.
        return 100.0

    lev_distance = levenshtein_distance(s1, s2)
    max_len = max(len(s1), len(s2))
    similarity = (1 - lev_distance / max_len) * 100
    return similarity

In [17]:
def label_pred_acc(predicted_df, annotated_df, acc_measure = 'cosine'):
    """Average per-field similarity between predictions and annotations.

    Rows are paired by position. For every row of ``predicted_df`` the
    'Product Name', 'Weight' and 'Expiry Date' values are scored against the
    same row of ``annotated_df``, and each field's scores are averaged over
    the number of predicted rows.

    Parameters
    ----------
    predicted_df : pandas.DataFrame
        Predictions; its row count drives the loop.
    annotated_df : pandas.DataFrame
        Reference annotations with the same three columns.
    acc_measure : str
        ``'cosine'`` scores with ``text_similarity``; ``'lev'`` scores with
        ``similarity_percentage``.

    Returns
    -------
    tuple
        ``(food_acc, weight_acc, date_acc)``.

    Raises
    ------
    ValueError
        If ``acc_measure`` is neither ``'cosine'`` nor ``'lev'``.
    """
    nrows = predicted_df.shape[0]

    # Choose the scorer once instead of repeating the loop per measure.
    if acc_measure == 'cosine':
        scorer = text_similarity
    elif acc_measure == 'lev':
        scorer = similarity_percentage
    else:
        raise ValueError("Please enter a valid Accuracy Measure")

    fields = ('Product Name', 'Weight', 'Expiry Date')
    totals = [0, 0, 0]

    for row in range(nrows):
        predicted_row = predicted_df.iloc[row]
        annotated_row = annotated_df.iloc[row]
        for slot, field in enumerate(fields):
            totals[slot] += scorer(predicted_row[field], annotated_row[field])

    food_acc, weight_acc, date_acc = (total / nrows for total in totals)
    return food_acc, weight_acc, date_acc


In [18]:
# Score the predictions with the default ('cosine') measure, then report each
# field's accuracy and their unweighted mean.
food_acc, weight_acc, date_acc = label_pred_acc(predicted_df, annotated_df)

summary = [
    f"Food Name Accuracy = {food_acc}",
    f"Weight Accuracy = {weight_acc}",
    f"Date Accuracy = {date_acc}",
    f"Overall Accuracy = {(food_acc + weight_acc + date_acc)/3}",
]
print("\n".join(summary))

Food Name Accuracy = 48.798345964974686
Weight Accuracy = 48.340334879649774
Date Accuracy = 62.404221307443024
Overall Accuracy = 53.180967384022495
