In [44]:
import os
import pandas as pd
import json
import csv
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from typing import List, Literal
import asyncio


import nest_asyncio

nest_asyncio.apply()  # Allows asyncio to work inside Jupyter


In [45]:
# Constants
OUTPUT_FILE = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\3) Outputs\All items\RAW.csv"
VALID_OUTPUT_FILE = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\3) Outputs\All items\CATEGORISED.csv"
INVALID_OUTPUT_FILE = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\3) Outputs\All items\INVALID_TEST.csv"
BATCH_SIZE = 2  # Number of items per batch (each item is one concurrent API request)

# OpenAI Client
# The API key is read from the environment so that no credential is stored in
# the notebook. Any key that was previously saved in this cell should be
# treated as compromised and revoked.
gpt_api_key = os.environ.get("OPENAI_API_KEY")
if not gpt_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
client = AsyncOpenAI(api_key=gpt_api_key)
gpt_model = "ft:gpt-4o-mini-2024-07-18:personal::BCOsoAm5"

# File paths
items_file_path = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\2) Data Preparations\all_items.xlsx"
validation_categories_file_path = r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\2) Data Preparations\Categories.xlsx"

# Fixed system instruction sent with every request
system_message = (
    "Categorise this item strictly using only the taxonomy from your training data. "
    "Select the closest Level 3 category (most granular) and assign the corresponding fixed Level 2 (mid-level) and Level 1 (broad) categories. "
    "Include the best matching characteristics and flavours. "
    "Do not assign anything not present in your training data. "
    "Output must match the JSON structure from the training dataset."
)

# Required keys expected in GPT output
required_keys = ["category_3", "category_2", "category_1", "characteristics", "flavours"]

In [46]:
# Load item file
df_items = pd.read_excel(items_file_path)

# Load the validation workbook in a single call. Passing a list to `sheet_name`
# returns a dict of DataFrames keyed by sheet name, so the "Category" sheet is
# parsed once instead of three times.
_validation_sheets = pd.read_excel(
    validation_categories_file_path,
    sheet_name=["Category", "Characteristic", "Flavour"],
)
_category_sheet = _validation_sheets["Category"]

# Each set holds the distinct non-null values of one column, as strings.
valid_L1 = set(_category_sheet["Level 1"].dropna().astype(str).unique())
valid_L2 = set(_category_sheet["Level 2"].dropna().astype(str).unique())
valid_L3 = set(_category_sheet["Level 3"].dropna().astype(str).unique())
valid_characteristics = set(_validation_sheets["Characteristic"]["Characteristic"].dropna().astype(str).unique())
valid_flavours = set(_validation_sheets["Flavour"]["Flavour"].dropna().astype(str).unique())

class CategorySchema(BaseModel):
    """Structured-output schema passed as ``response_format`` to the model.

    The allowed values are built from the validation sets loaded from the
    categories workbook. Each set is sorted before being turned into a
    ``Literal`` so the generated schema lists its values in the same order on
    every run (set iteration order for strings is not stable across kernel
    sessions). ``Literal[tuple(...)]`` is equivalent to ``Literal[*...]`` at
    runtime and does not require the star-subscript syntax.
    """

    category_3: Literal[tuple(sorted(valid_L3))]  # Strict validation for Level 3
    category_2: str  # Allow any string for Level 2 (max 500 allowed across all schema)
    category_1: Literal[tuple(sorted(valid_L1))]  # Strict validation for Level 1
    characteristics: List[Literal[tuple(sorted(valid_characteristics))]] = Field(default_factory=list)
    flavours: List[Literal[tuple(sorted(valid_flavours))]] = Field(default_factory=list)


In [43]:
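# NOTE: disabled synchronous, single-item variant of the API call, kept only for manual spot checks.
# from openai import OpenAI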

# client = OpenAI(api_key=gpt_api_key)

# def prompt_gpt(item_name):
#     """Processes a single product name using OpenAI API with response validation."""
#     messages = [
#         {"role": "system", "content": system_message},
#         {"role": "user", "content": item_name}
#     ]
    
#     response = client.beta.chat.completions.parse(
#         model=gpt_model,
#         messages=messages,
#         max_tokens=100,
#         temperature=0,
#         response_format=CategorySchema,
#     )
    
#     try:
#         return response.choices[0].message.parsed.dict()
#     except Exception as e:
#         print(f"Validation failed for item {item_name}: {e}")
#         return None
    
# response_out = prompt_gpt("Apple pie")

# response_out

In [47]:
# Ensure result files have headers if they don't exist.
# Each file gets the header that matches the rows later appended to it:
# RAW rows are (UID, product name, Output); valid rows have five fields;
# invalid rows are (UID, product name, Reason).
_result_headers = {
    OUTPUT_FILE: ["UID", "product name", "Output"],
    VALID_OUTPUT_FILE: ["UID", "category_3", "attribute type", "attribute value", "value"],
    INVALID_OUTPUT_FILE: ["UID", "product name", "Reason"],
}

for file, header in _result_headers.items():
    # Existing files are left untouched so earlier results are not overwritten.
    if not os.path.exists(file):
        with open(file, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(header)

In [48]:
async def prompt_gpt_batch(uids, item_names):
    """Categorise a batch of product names concurrently, preserving input order.

    One structured-output request (``client.beta.chat.completions.parse`` with
    ``CategorySchema`` as the response format) is issued per item, and all
    requests are awaited together.

    Args:
        uids: Item identifiers, aligned position-by-position with ``item_names``.
        item_names: Product names; each one is sent as the user message.

    Returns:
        A list of ``(uid, item_name, output)`` tuples in input order. ``output``
        is the raw message content of the first choice, or the string
        ``"Error"`` when the request for that item raised.
    """
    requests = [
        client.beta.chat.completions.parse(
            model=gpt_model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": item},
            ],
            max_tokens=100,
            temperature=0,
            response_format=CategorySchema,
        )
        for _, item in zip(uids, item_names)
    ]

    # return_exceptions=True keeps one failed request from discarding the
    # results of every other item in the batch; failures come back as
    # exception objects in the matching position.
    responses = await asyncio.gather(*requests, return_exceptions=True)

    results = []
    for uid, item, response in zip(uids, item_names, responses):
        if isinstance(response, BaseException):
            print(f"Error for item {uid}: {response}")
            output = "Error"  # same sentinel the batch-level handler writes
        else:
            output = response.choices[0].message.content
        results.append((uid, item, output))
    return results


async def process_items():
    """Send ``df_items`` through the model batch by batch, appending to OUTPUT_FILE.

    Rows are taken ``BATCH_SIZE`` at a time. Each batch's results are appended
    to the CSV as soon as the batch finishes. If the batch call raises, every
    item in that batch is written with ``"Error"`` as its output and processing
    continues with the next batch.
    """
    n_items = len(df_items)
    total_batches = -(-n_items // BATCH_SIZE)  # ceiling division
    print(f"Total items: {n_items}, Total batches: {total_batches}")

    for batch_no, start in enumerate(range(0, n_items, BATCH_SIZE), start=1):
        chunk = df_items.iloc[start:start + BATCH_SIZE]
        uids = chunk["UID"].tolist()
        item_names = chunk["product name"].tolist()

        try:
            print(f"Processing batch {batch_no}/{total_batches}")
            batch_results = await prompt_gpt_batch(uids, item_names)
        except Exception as exc:
            print(f"Error in batch {batch_no}: {exc}")
            batch_results = [(uid, name, "Error") for uid, name in zip(uids, item_names)]

        # Append after every batch so completed work is already on disk.
        with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as out_file:
            csv.writer(out_file).writerows(batch_results)

def validate_responses():
    """Validate every raw model response and append the outcome to the result files.

    Reads OUTPUT_FILE, parses each ``Output`` cell as JSON and checks it against
    the validation sets:

    * A response that is not a JSON object is recorded as ``Invalid JSON``.
    * A ``category_3`` that is missing, not a string, or not in ``valid_L3``
      is recorded as invalid and the row is skipped.
    * Each characteristic / flavour yields one row in VALID_OUTPUT_FILE when
      it is in the matching validation set, otherwise one row in
      INVALID_OUTPUT_FILE.

    Both result files are appended to without a header row.

    NOTE(review): an item with a valid Level 3 category but no characteristics
    or flavours produces no row in either result file. Confirm that is intended.
    """
    df_results = pd.read_csv(OUTPUT_FILE)
    valid_entries, invalid_entries = [], []

    for _, row in df_results.iterrows():
        uid, item_name, response = row["UID"], row["product name"], row["Output"]

        try:
            # A non-string cell (e.g. NaN for an empty output) raises TypeError here.
            data = json.loads(response)
            if not isinstance(data, dict):
                raise ValueError("Not a dictionary")
        except (json.JSONDecodeError, ValueError, TypeError):
            invalid_entries.append({"UID": uid, "product name": item_name, "Reason": "Invalid JSON"})
            continue

        # Missing or wrongly typed fields fall back to an empty list.
        parsed_data = {key: data[key] if isinstance(data.get(key), (str, list)) else [] for key in required_keys}

        # Check if required category_3 exists. The isinstance guard matters:
        # testing a list for set membership raises TypeError (unhashable).
        L3_value = parsed_data["category_3"]
        if not isinstance(L3_value, str) or L3_value not in valid_L3:
            invalid_entries.append({"UID": uid, "product name": item_name, "Reason": f"Invalid L3 Category: {L3_value}"})
            continue

        # Validate characteristics and flavours
        for attr_type, valid_set in [("characteristics", valid_characteristics), ("flavours", valid_flavours)]:
            attrs = parsed_data.get(attr_type, [])
            if isinstance(attrs, str):
                # A bare string is one attribute, not a sequence of characters.
                attrs = [attrs] if attrs else []
            for attr in attrs:
                # Non-string attributes (e.g. nested lists or dicts) are never valid.
                if isinstance(attr, str) and attr in valid_set:
                    valid_entries.append({"UID": uid, "category_3": L3_value, "Attribute Type": attr_type.capitalize(), "Attribute Value": attr, "Value": 1})
                else:
                    invalid_entries.append({"UID": uid, "product name": item_name, "Reason": f"Invalid {attr_type.capitalize()}: {attr}"})

    # Save valid entries to CSV
    if valid_entries:
        pd.DataFrame(valid_entries).to_csv(VALID_OUTPUT_FILE, mode="a", index=False, header=False)

    # Save invalid entries to CSV
    if invalid_entries:
        pd.DataFrame(invalid_entries).to_csv(INVALID_OUTPUT_FILE, mode="a", index=False, header=False)

    print("Validation complete. Files saved.")

In [49]:
# Trial run: keep only the first six items.
df_items = df_items.head(6)

In [50]:
async def main():
    """Run the categorisation pass over ``df_items``, then validate what was written."""
    # Stage 1: call the model batch by batch and append raw outputs to OUTPUT_FILE.
    await process_items()
    # Stage 2: split the raw outputs into the valid and invalid result files.
    validate_responses()


# Kick off the whole pipeline from the notebook cell.
asyncio.run(main())


Total items: 6, Total batches: 3
Processing batch 1/3
Processing batch 2/3
Processing batch 3/3
Validation complete. Files saved.


In [51]:
# Display the items that were sent through the pipeline in this run.
df_items

Unnamed: 0,UID,product name
0,ID_1,Hellmann's Light Mayonnaise Squeezy 650Ml
1,ID_2,Express Tesco Egg Custard Tart 2 Pack
2,ID_3,Tesco Loose Red Peppers(C)
3,ID_4,Tesco Sweetheart Cabbage (C)
4,ID_5,Tesco Carrot 500G (C)
5,ID_6,Tesco British Chicken Breast Mini Fillets 400G


In [None]:
# `df_results` is only a local variable inside validate_responses(), so it is
# not defined at notebook level; reload the raw output file to inspect it.
df_results = pd.read_csv(OUTPUT_FILE)
df_results.head()