In [1]:
import requests

In [2]:
def query_openfoodfacts_and_extract(product_name, field_options):
    """
    Queries the OpenFoodFacts database for a product by name and extracts specified information based on function arguments.
    The field_options parameter specifies which fields to include in the result.
    """
    base_url = "https://world.openfoodfacts.org/cgi/search.pl"
    fields_to_query = []

    # Build the list of fields to query based on the field_options
    for field, include in field_options.items():
        if include:
            fields_to_query.append(field)

    # Prepare the parameters for the request
    params = {
        'search_terms': product_name,
        'action': 'process',
        'json': 1,
        'fields': ','.join(fields_to_query) if fields_to_query else 'product_name',  # Default to product_name if no fields are selected
    }

    # Add custom headers as recommended by the API usage guidelines
    headers = {
        'User-Agent': 'testapp - Version 1 - benedict.s@hotmail.co.uk'
    }

    response = requests.get(base_url, params=params, headers=headers)
    extracted_info = {}

    if response.status_code == 200:
        try:
            # Extract the first search result and get the product info
            product = response.json().get('products', [{}])[0]
            extracted_info["status"] = "Data Found"

            # Fill extracted_info with data from product for each field in fields_to_query
            for field in fields_to_query:
                extracted_info[field] = product.get(field, 'Not available')

        except:
            # If no result was found return dict with no result in it 
            extracted_info["status"] = "No Data in Website"

        return extracted_info

    else:
        extracted_info["status"] = "API Call Unsuccessful"
        return extracted_info


In [3]:
field_options = {
    'additives_n': True,
    'additives_tags': True,
    'allergens': True,
    'brands': True,
    'categories': True,
    'ecoscore_grade': True,
    'ingredients_n': True,
    'ingredients_text': True,
    'labels': True,
    'labels_hierarchy': True,
    'nutrition_grades': True,
    'nova_group': True,
    'nutriments': True,
    'packaging_recycling_tags': True,
    'packaging_tags': True,
    'product_name': True,
    'stores_tags': True,
}

product_info = query_openfoodfacts_and_extract("chocotastic tart", field_options)
print(product_info)


{'status': 'Data Found', 'additives_n': 4, 'additives_tags': ['en:e415', 'en:e422', 'en:e450', 'en:e500', 'en:e500ii'], 'allergens': 'en:gluten,en:soybeans', 'brands': "Kellog's,Poptarts,Pop Tarts,Pop-Tarts,Kellogs", 'categories': 'Imbiss, Süßer Snack, Kekse und Kuchen, Kekse, Toastergebäck', 'ecoscore_grade': 'd', 'ingredients_n': 28, 'ingredients_text': '_Weizenmehl_, Zucker, Dextrose, Palmöl, Glukosesirup, _Süßmolkenpulver_, fettarmes Kakaopulver, Feuchthaltemittel (Glycerin), Invertzuckersirup, _Weizenstärke_, Salz, Backtriebmittel (Natriumhydrogencarbonat, E450), Rindergelatine, _Trockeneiklar_, Stärke, Stabilisator (Xanthan), Emulgator (_Sojalecithine_), brauner Reissirup, Niacin, Eisen, Thiamin, Riboflavin, Folsäure', 'labels': 'Ohne künstliche Aromen, Grüner Punkt, Ohne künstliche Farbstoffe, Keine künstlichen Farbstoffe oder Aromen, Ohne Farbstoffzusatz', 'labels_hierarchy': ['en:no-artificial-flavors', 'en:green-dot', 'en:no-artificial-colors', 'en:no-artificial-colours-or-fl

In [242]:
"""
To Do:

Tidy up code
Change selection so we do a fuzzy name match and choose closest search result, not first one
Look at extracting less data (look for duplicated stuff)
structure output into something that can be fed into LLM
run this for all items (ensure it doesn't run if item exact name has been run before, maybe in the last year?)
build prompt to pass this into LLM for categorisation (if it hasn't been run before, maybe in the last year?)
look at how i can pass in categories to LLM only once, to save tokens 
maybe LLM should only pass out the L3 and L2 and L1 auto populate to save tokens 


FOUND THAT SEARCH IS DONE ON FULL WORD FUZZY MATCH, ABOVE x% 

"""

In [4]:
import os
import pandas as pd

file_path = r"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\4 - Processed Data Files\benedict.s@hotmail.co.uk"
file_name = "benedict.s@hotmail.co.uk_transactions.xlsx"

# Correctly join the file path and file name
full_file_path = os.path.join(file_path, file_name)

# Correct the argument name from FilePath to io
transaction_df = pd.read_excel(io=full_file_path)

In [18]:
transaction_df["openfoodfacts_formatted"] = None

In [23]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Example setup (ensure you have your actual DataFrame and predefined variables)
# transaction_df = pd.DataFrame({'name': ['product1', 'product2', 'product3']})
# field_options = ...

def parallel_query(row):
    # This wrapper function is used to pass the row to your predefined function
    return query_openfoodfacts_and_extract(row, field_options)

# List to hold the results
results = [None] * len(transaction_df)  # Pre-allocate list for results

# Number of threads in the pool - adjust based on your system and the API's rate limits
num_threads = 5

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Prepare a dictionary to hold future-to-index mappings
    future_to_index = {executor.submit(parallel_query, formatted_name): index for index, formatted_name in enumerate(transaction_df['formatted_name'])}
    
    # Process futures as they complete
    for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="Processing API Calls"):
        index = future_to_index[future]
        try:
            results[index] = future.result()
        except Exception as exc:
            print(f"API call generated an exception: {exc}")
            results[index] = None  # or a default value

# Assigning the results to the DataFrame
transaction_df['openfoodfacts_formatted'] = results

Processing API Calls: 100%|██████████| 1141/1141 [02:54<00:00,  6.53it/s]


In [15]:
# Assuming transaction_df['openfoodfacts'] contains JSON objects
count_data_found = transaction_df['openfoodfacts'].apply(lambda x: x.get('status') == 'Data Found').sum()

print(f"Number of JSONs with data found {count_data_found}")

Number of JSONs with data found 170


In [20]:
# Assuming transaction_df['openfoodfacts'] contains JSON objects
count_data_found = transaction_df['openfoodfacts_formatted'].apply(lambda x: x.get('status') == 'Data Found').sum()

print(f"Number of JSONs with data found {count_data_found}")

Number of JSONs with data found 542


In [24]:
# Assuming transaction_df['openfoodfacts'] contains JSON objects
count_data_found = transaction_df['openfoodfacts_formatted'].apply(lambda x: x.get('status') == 'Data Found').sum()

print(f"Number of JSONs with data found {count_data_found}")

Number of JSONs with data found 630


In [22]:
def format_name(name):
    # Step 1: Trim trailing whitespace
    trimmed_name = name.strip()
    
    # Step 2: Remove "Tesco" from the string
    # Note: Using case-insensitive replacement to remove all occurrences of "Tesco"
    name_without_tesco = re.sub(r'\bTesco\b', '', trimmed_name, flags=re.IGNORECASE).strip()
    
    # Step 3: Remove the last word if it contains a non-letter character
    words = name_without_tesco.split()
    if words and not words[-1].isalpha():
        words.pop()  # Remove the last word if it contains non-letter characters
    
    # Join the words back into a string and return
    return ' '.join(words)

# Apply the function to the 'name' column and store the result in 'formatted_name'
transaction_df['formatted_name'] = transaction_df['name'].apply(format_name)

transaction_df

Unnamed: 0,name,quantity,channel,weightInGrams,price,volumeInLitres,timeStamp,customer_email,openfoodfacts,formatted_name,openfoodfacts_formatted
0,Hellmann's Light Mayonnaise Squeezy 650Ml,1,IN_STORE,,3.10,,2024-01-26 21:04:04.307,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Hellmann's Light Mayonnaise Squeezy,{'status': 'No Data in Website'}
1,Express Tesco Egg Custard Tart 2 Pack,1,IN_STORE,,0.95,,2024-01-26 21:04:04.307,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Express Egg Custard Tart 2 Pack,{'status': 'No Data in Website'}
2,Tesco Loose Red Peppers(C),1,IN_STORE,,0.65,,2024-01-22 12:26:33.803,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Loose Red,{'status': 'No Data in Website'}
3,Tesco Sweetheart Cabbage (C),1,IN_STORE,,0.75,,2024-01-22 12:26:33.803,benedict.s@hotmail.co.uk,"{'status': 'Data Found', 'additives_n': 'Not a...",Sweetheart Cabbage,"{'status': 'Data Found', 'additives_n': 'Not a..."
4,Tesco Carrot 500G (C),1,IN_STORE,,0.45,,2024-01-22 12:26:33.803,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Carrot 500G,{'status': 'No Data in Website'}
...,...,...,...,...,...,...,...,...,...,...,...
1136,"Gillette Fusion 5, Razor & 10 Blades Set",1,IN_STORE,,27.99,,2021-07-13 18:36:44.345,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},"Gillette Fusion 5, Razor & 10 Blades Set",{'status': 'No Data in Website'}
1137,Tesco Bananas Loose,1,IN_STORE,,0.73,,2021-07-13 18:36:44.345,benedict.s@hotmail.co.uk,"{'status': 'Data Found', 'additives_n': 'Not a...",Bananas Loose,"{'status': 'Data Found', 'additives_n': 'Not a..."
1138,Tesco Smoky Olives With Red Pepper 220G,1,IN_STORE,,2.50,,2021-07-13 18:36:44.345,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Smoky Olives With Red Pepper,{'status': 'No Data in Website'}
1139,Tesco Medium Free Range Eggs 12 Pack,1,IN_STORE,,1.69,,2021-07-13 18:36:44.345,benedict.s@hotmail.co.uk,{'status': 'No Data in Website'},Medium Free Range Eggs 12 Pack,{'status': 'No Data in Website'}


In [None]:
"""

Foodfacts works best if you have the barcode of the item (like Yuka)
See if I can get around this to probe the database with the item, or obtain the barcodes

"""