In [29]:
import pandas as pd
# Load the label texts. header=None tells pandas the file has no header row,
# so every line is read as data and the columns get integer labels.
adrian = pd.read_csv('label_text_Adrian.csv', header = None)
# Bare last expression: render the whole frame as the cell output.
adrian

Unnamed: 0,0
0,Boneless Lamb Product of Australia L Leg-Chump...
1,Keep Frozen Product of Australia Beef Burger B...
2,Premium Cooked Frozen Octopus Tentacles Sous V...
3,Frozen Ocean Trout Fillet Skin On Individually...
4,Gourmet Beef Burger Patties Part Cooked 120g T...
5,Tassal Tasmanian Salmon Product of Australia T...
6,SL SFRC Date 4/07/24 Packed Snapper FLTS Red C...
7,Frozen Thigh Fillet N/B-Diced 94234244564 Prod...
8,(30PCS/BX) Frozen Refrigeration: Keep Frozen(S...
9,SEFSSFFF Packed Date 25/06/24 Snapper FLTS Sad...


In [30]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
# Checkpoint identifier shared by the tokenizer and the model so the two always match.
modelname = 'bert-large-uncased-whole-word-masking-finetuned-squad'
# Tokenizer and question-answering model loaded from the same checkpoint.
# Loading prints a "weights not used" notice listing the bert.pooler.* weights.
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForQuestionAnswering.from_pretrained(modelname)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
from datetime import datetime, timedelta
import re
# Questions asked about each label text.
# get_answer routes on keywords found in the lower-cased question:
#   "weight"            -> regex weight extraction
#   "food" / "product"  -> regex product-name extraction, falling back to the QA model
# The expiry heuristics only fire for "expiry date", "use by date",
# "best before date" or "expiry"; "What is the last date ?" contains none of
# these, so that question is answered by the QA model alone.
questions = [
    "What is the name of the food?",
    "What is the last date ?",
    "What is the weight of the food ?"
]

# Function to compute the best before date from packed date
def compute_best_before_date(packed_date, months):
    """Return the best-before date ``months`` calendar months after ``packed_date``.

    Args:
        packed_date: Day-first date string, ``DD/MM/YY`` or ``DD/MM/YYYY``
            (single-digit day/month are accepted by ``strptime``).
        months: Whole number of months of shelf life to add.

    Returns:
        The resulting date formatted as ``DD/MM/YYYY``.

    Raises:
        ValueError: If ``packed_date`` matches neither supported format.
    """
    import calendar  # function-scope import keeps this block self-contained

    packed_date = packed_date.strip()
    # Try the two-digit year first: "%y" rejects a four-digit year ("unconverted
    # data remains") and "%Y" requires four digits, so exactly one can match.
    for date_format in ("%d/%m/%y", "%d/%m/%Y"):
        try:
            packed_date_obj = datetime.strptime(packed_date, date_format)
            break
        except ValueError:
            continue
    else:
        raise ValueError(
            f"Unsupported packed date {packed_date!r}; expected DD/MM/YY or DD/MM/YYYY"
        )

    # Calendar-accurate month arithmetic (a fixed 30-day month drifts by several
    # days over a year). Work with a zero-based month index so the year carry is
    # a plain floor division.
    month_index = packed_date_obj.month - 1 + months
    year = packed_date_obj.year + month_index // 12
    month = month_index % 12 + 1
    # Clamp the day for shorter target months (e.g. 31 Jan + 1 month -> 28/29 Feb).
    day = min(packed_date_obj.day, calendar.monthrange(year, month)[1])

    best_before_date_obj = packed_date_obj.replace(year=year, month=month, day=day)
    return best_before_date_obj.strftime("%d/%m/%Y")


# Function to parse dates in DDMMYY format
def parse_ddmmyy(date_str):
    """Reformat a compact ``DDMMYY`` date string as ``DD/MM/YYYY``.

    Raises:
        ValueError: If ``date_str`` is not a valid ``DDMMYY`` date.
    """
    parsed = datetime.strptime(date_str, "%d%m%y")
    formatted = parsed.strftime("%d/%m/%Y")
    return formatted

# Function to get answer for a single question
# Question keywords that route a question to the expiry-date heuristics.
_EXPIRY_TERMS = ("expiry date", "use by date", "best before date", "expiry")


def _extract_weight(context):
    """Return the first ``<number> KG`` token in ``context`` (case-insensitive), or None."""
    match = re.search(r'\b\d+\.?\d*\s*KG\b', context, re.IGNORECASE)
    return match.group(0).strip() if match else None


def _extract_expiry_date(context):
    """Return an expiry date found by the label heuristics, or None if none applies."""
    # 1) Relative form: "Packed Date <d/m/y>" plus "Best Before N Months from Packed Date".
    #    Matched case-insensitively, like every other pattern in this function.
    packed_date_match = re.search(r'Packed Date (\d{1,2}/\d{1,2}/\d{2,4})', context, re.IGNORECASE)
    best_before_match = re.search(r'Best Before (\d+) Months from Packed Date', context, re.IGNORECASE)
    if packed_date_match and best_before_match:
        return compute_best_before_date(packed_date_match.group(1), int(best_before_match.group(1)))

    # 2) Compact form: "Best Before: DDMMYY".
    date_match_ddmmyy = re.search(r'Best Before:\s*(\d{6})', context, re.IGNORECASE)
    if date_match_ddmmyy:
        return parse_ddmmyy(date_match_ddmmyy.group(1))

    # 3) Explicit form: "<label>[:] DD/MM/YYYY".
    date_match = re.search(r'(Use By Date|Best Before|Expiry Date):?\s*(\d{2}/\d{2}/\d{4})', context, re.IGNORECASE)
    if date_match:
        return date_match.group(2)
    return None


def _extract_product_name(context):
    """Return the text between the packed date and the first terminator phrase, or None."""
    match = re.search(r'Packed Date \d{1,2}/\d{1,2}/\d{2,4} (.+?) (?:\d+\s*KG|Keep Frozen|Product of Australia)', context, re.IGNORECASE)
    if not match:
        return None
    product_description = match.group(1).strip()
    # Drop any "Product of Australia" that ended up inside the captured description.
    return re.sub(r'\bProduct of Australia\b', '', product_description, flags=re.IGNORECASE).strip()


def _model_answer(question, context):
    """Return the span the QA model picks for ``question`` within ``context``."""
    # Truncate only the context (second sequence) so an over-long label cannot
    # exceed the model's position limit. This relies on the tokenizer's
    # configured maximum length -- TODO confirm it is set for this checkpoint.
    inputs = tokenizer(question, context, return_tensors='pt', truncation="only_second")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start token and (exclusive) end token of the answer span.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    # If the predicted end precedes the start the slice is empty and the
    # answer is the empty string.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    return tokenizer.convert_tokens_to_string(tokens).strip()


def get_answer(question, context):
    """Answer ``question`` about the label text ``context``.

    Routing is by keyword in the lower-cased question, checked in this order:

    * ``"weight"``: first ``<number> KG`` token in the context, else
      ``"Weight information not found."``.
    * an expiry term (``"expiry date"``, ``"use by date"``,
      ``"best before date"``, ``"expiry"``): date from the label heuristics,
      else ``"There's no expiry date."``.
    * ``"food"`` or ``"product"``: product description following the packed
      date; if that pattern is absent the QA model is used instead.
    * anything else: the QA model's extracted span.

    The QA model is only run when no heuristic produced the answer.

    Args:
        question: Natural-language question.
        context: Label text to search.

    Returns:
        The answer string (may be empty when the model predicts an empty span).
    """
    question_lc = question.lower()

    if "weight" in question_lc:
        weight = _extract_weight(context)
        return weight if weight is not None else "Weight information not found."

    if any(term in question_lc for term in _EXPIRY_TERMS):
        expiry = _extract_expiry_date(context)
        return expiry if expiry is not None else "There's no expiry date."

    if "food" in question_lc or "product" in question_lc:
        product = _extract_product_name(context)
        if product is not None:
            return product

    return _model_answer(question, context)

In [62]:
# Label text under column label 0, row label 1 (the second label in the file).
context = adrian[0][1]

# Ask every configured question about this one label and print the answers.
for question in questions:
    answer = get_answer(question, context)
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: What is the name of the food?
Answer: beef burger

Question: What is the last date ?
Answer: 14 may 2025

Question: What is the weight of the food ?
Answer: 13.63 kg



In [64]:
# Show the raw label text that the questions above were answered from.
adrian[0][1]

'Keep Frozen Product of Australia Beef Burger Bulk Pack Net Wt 13.63 kg 13.63 KG Pkg On 14 May 2024 Best Before 14 May 2025 Top Cut Foods Pty Ltd 101265 Boneless Beef NL Allergens'