In [34]:
import os
import pandas as pd
import json
import csv
from openai import AsyncOpenAI
from datetime import datetime
import asyncio


import nest_asyncio

nest_asyncio.apply()  # Allows asyncio to work inside Jupyter


In [35]:
# Constants
# Base data directory. Defaults to the original Google Drive location; set the
# SPEND_ANALYTICS_DATA_DIR environment variable to run against another copy.
DATA_DIR = os.environ.get(
    "SPEND_ANALYTICS_DATA_DIR",
    r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data",
)
_OUTPUT_DIR = os.path.join(DATA_DIR, "3) Outputs", "All items")
_PREPARATION_DIR = os.path.join(DATA_DIR, "2) Data Preparations")

OUTPUT_FILE = os.path.join(_OUTPUT_DIR, "raw_responses.csv")  # raw model output, one row per item
VALID_OUTPUT_FILE = os.path.join(_OUTPUT_DIR, "categorised_products.csv")  # validated attributes
INVALID_OUTPUT_FILE = os.path.join(_OUTPUT_DIR, "invalid_results.csv")  # rejected responses + reason
BATCH_SIZE = 5  # Number of items per batch

# OpenAI Client
# SECURITY: the API key must never be stored in the notebook. It is read from the
# OPENAI_API_KEY environment variable. Any key that was previously committed in
# this file should be treated as leaked and revoked.
gpt_api_key = os.environ.get("OPENAI_API_KEY")
if not gpt_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this notebook."
    )
client = AsyncOpenAI(api_key=gpt_api_key)
gpt_model = "ft:gpt-4o-mini-2024-07-18:personal::B9eg9Tmn"

# File paths
items_file_path = os.path.join(_PREPARATION_DIR, "all_items.xlsx")
validation_categories_file_path = os.path.join(_PREPARATION_DIR, "Categories.xlsx")

# Fixed system instruction
system_message = (
    "Categorise this item strictly using only the taxonomy from your training data. "
    "Select the closest Level 3 category (most granular) and assign the corresponding fixed Level 2 (mid-level) and Level 1 (broad) categories. "
    "Include the best matching characteristics and flavours. "
    "Do not assign anything not present in your training data. "
    "Output must match the JSON structure from the training dataset."
)

# Required keys expected in GPT output
required_keys = ["Level 3", "Level 2", "Level 1", "characteristics", "flavours"]

In [36]:
# Items to categorise (read from the first sheet of the workbook)
df_items = pd.read_excel(items_file_path)


def _unique_strings(sheet_name, column):
    """Return the distinct non-null values of `column` on `sheet_name`, as a set of str."""
    sheet = pd.read_excel(validation_categories_file_path, sheet_name=sheet_name)
    values = sheet[column].dropna().astype(str)
    return set(values.unique())


# Reference vocabularies used to validate the model output
valid_L3 = _unique_strings("Category", "Level 3")
valid_characteristics = _unique_strings("Characteristic", "Characteristic")
valid_flavours = _unique_strings("Flavour", "Flavour")


In [38]:
# Ensure result files have headers if they don't exist.
# Each file gets the header matching the rows that are later appended to it:
#   - OUTPUT_FILE receives (UID, Product name, Output) tuples from process_items
#   - VALID_OUTPUT_FILE receives the five attribute columns from validate_responses
#   - INVALID_OUTPUT_FILE receives (UID, Product name, Reason) from validate_responses
result_file_headers = {
    OUTPUT_FILE: ["UID", "Product name", "Output"],
    VALID_OUTPUT_FILE: ["UID", "Level 3", "Attribute Type", "Attribute Value", "Value"],
    INVALID_OUTPUT_FILE: ["UID", "Product name", "Reason"],
}

for file, header in result_file_headers.items():
    # An existing but empty file also needs its header written.
    if not os.path.exists(file) or os.path.getsize(file) == 0:
        with open(file, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(header)

In [None]:
async def prompt_gpt_batch(uids, item_names):
    """Send one chat-completion request per item concurrently and pair results with their UIDs.

    Args:
        uids: Identifiers of the items, in the same order as `item_names`.
        item_names: Product names; each is sent as the user message.

    Returns:
        A list of `(uid, item_name, output)` tuples in input order. `output` is the
        content of the first choice, or the string "Error" when that single request
        failed (the same sentinel process_items writes for a failed batch).
    """
    tasks = [
        client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": item},
            ],
            max_tokens=100,
            temperature=0,
        )
        for _, item in zip(uids, item_names)
    ]

    # return_exceptions=True keeps successful responses when a sibling request fails;
    # otherwise one failure would throw away the whole batch.
    responses = await asyncio.gather(*tasks, return_exceptions=True)

    outputs = []
    for uid, response in zip(uids, responses):
        if isinstance(response, BaseException):
            print(f"Error for UID {uid}: {response}")
            outputs.append("Error")
        else:
            outputs.append(response.choices[0].message.content)

    # gather preserves argument order, so zipping keeps the UID-output mapping intact
    return list(zip(uids, item_names, outputs))


async def process_items():
    """Walk df_items in chunks of BATCH_SIZE, query the model, and append each chunk's rows to OUTPUT_FILE.

    A batch whose API call raises is still written, with "Error" as the output of
    every item in it, so the raw file always holds one row per processed item.
    """
    item_count = len(df_items)
    total_batches = (item_count + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
    print(f"Total items: {item_count}, Total batches: {total_batches}")

    for batch_no, start in enumerate(range(0, item_count, BATCH_SIZE), start=1):
        chunk = df_items.iloc[start:start + BATCH_SIZE]
        uids = chunk["UID"].tolist()
        item_names = chunk["Product name"].tolist()

        try:
            print(f"Processing batch {batch_no}/{total_batches}")
            batch_results = await prompt_gpt_batch(uids, item_names)
        except Exception as exc:
            print(f"Error in batch {batch_no}: {exc}")
            batch_results = [(uid, name, "Error") for uid, name in zip(uids, item_names)]

        # Append after every batch so completed work survives a later failure
        with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as out:
            csv.writer(out).writerows(batch_results)

def validate_responses():
    """Validate every raw response in OUTPUT_FILE and append the results to the valid/invalid CSVs.

    For each row the "Output" column is parsed as JSON. A row is rejected when the
    output is not a JSON object or when its "Level 3" value is not a string present
    in valid_L3. Otherwise each characteristic/flavour is checked against its
    reference set: accepted values become one row each in VALID_OUTPUT_FILE,
    rejected values become one row each in INVALID_OUTPUT_FILE.

    Rows are appended without a header: valid rows have the columns
    (UID, Level 3, Attribute Type, Attribute Value, Value) and invalid rows have
    (UID, Product name, Reason).
    """
    df_results = pd.read_csv(OUTPUT_FILE)
    valid_entries, invalid_entries = [], []

    for _, row in df_results.iterrows():
        uid, item_name, response = row["UID"], row["Product name"], row["Output"]

        try:
            # A blank Output cell is read back as NaN; json.loads then raises TypeError
            data = json.loads(response)
            if not isinstance(data, dict):
                raise ValueError("Not a dictionary")
        except (json.JSONDecodeError, ValueError, TypeError):
            invalid_entries.append({"UID": uid, "Product name": item_name, "Reason": "Invalid JSON"})
            continue

        # Level 3 must be a string: a missing key or a list/dict value is rejected
        # here rather than reaching the set lookup, where an unhashable value would
        # raise TypeError and abort the whole validation run.
        L3_value = data.get("Level 3")
        if not isinstance(L3_value, str) or L3_value not in valid_L3:
            invalid_entries.append({"UID": uid, "Product name": item_name, "Reason": f"Invalid L3 Category: {L3_value}"})
            continue

        # Validate characteristics and flavours
        for attr_type, valid_set in [("characteristics", valid_characteristics), ("flavours", valid_flavours)]:
            raw_attrs = data.get(attr_type)
            if isinstance(raw_attrs, str):
                # A bare string is a single attribute, not a sequence of characters
                attrs = [raw_attrs]
            elif isinstance(raw_attrs, list):
                attrs = raw_attrs
            else:
                attrs = []

            for attr in attrs:
                # isinstance guard: nested lists/dicts are unhashable and cannot be looked up in a set
                if isinstance(attr, str) and attr in valid_set:
                    valid_entries.append({"UID": uid, "Level 3": L3_value, "Attribute Type": attr_type.capitalize(), "Attribute Value": attr, "Value": 1})
                else:
                    invalid_entries.append({"UID": uid, "Product name": item_name, "Reason": f"Invalid {attr_type.capitalize()}: {attr}"})

    # NOTE(review): an item with a valid Level 3 but no attributes produces no row in
    # either file - confirm that is intended.
    # NOTE(review): the whole OUTPUT_FILE is re-validated and appended on every call,
    # so running this twice duplicates rows - confirm whether that is acceptable.

    # Save valid entries to CSV
    if valid_entries:
        pd.DataFrame(
            valid_entries,
            columns=["UID", "Level 3", "Attribute Type", "Attribute Value", "Value"],
        ).to_csv(VALID_OUTPUT_FILE, mode="a", index=False, header=False)

    # Save invalid entries to CSV
    if invalid_entries:
        pd.DataFrame(
            invalid_entries,
            columns=["UID", "Product name", "Reason"],
        ).to_csv(INVALID_OUTPUT_FILE, mode="a", index=False, header=False)

    print("Validation complete. Files saved.")

In [40]:
async def main():
    """Run the full pipeline: query the model for every item, then validate the saved responses.

    Order matters: validate_responses reads the file that process_items appends to.
    """
    await process_items()  # Run batch processing
    validate_responses()  # Validate and save

# asyncio.run() is used from inside Jupyter here; the nest_asyncio.apply() call in the
# imports cell is what allows that.
asyncio.run(main())  # Execute the full process


Total items: 10, Total batches: 2
Processing batch 1/2
Processing batch 2/2
Validation complete. Files saved.
