In [7]:
import json
import os
import time
from datetime import datetime

import pandas as pd
from openai import OpenAI

# -------------------------------
# SETUP
# -------------------------------
# The API key is read from the environment so that no secret is stored in the
# notebook source or its saved outputs. Any key that was ever committed in this
# file must be treated as compromised and revoked.
gpt_api_key = os.environ.get("OPENAI_API_KEY")
if not gpt_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

gpt_model = "gpt-4o-mini-2024-07-18"  # or change to a specific fine-tunable snapshot if desired

# Source CSV of labelled items. The original location stays the default, but it
# can be overridden with SPEND_ANALYTICS_INPUT_CSV so the notebook also runs on
# machines where that drive layout does not exist.
input_file = os.environ.get(
    "SPEND_ANALYTICS_INPUT_CSV",
    r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\2) Data\2) Data Preparations\synthetic_items.csv",
)
training_jsonl_filename = "training_data.jsonl"

# Single client instance reused by every API call in the cells below
client = OpenAI(api_key=gpt_api_key)

# -------------------------------
# DATA PREPARATION
# -------------------------------
# Load the CSV, then shuffle every row (frac=1) with a fixed seed so the order
# is random but reproducible, and renumber the index from zero.
df = (
    pd.read_csv(input_file)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Accumulates one {"messages": [...]} training example per input row
output_list = []

# Fixed instruction sent as the system message of every training example.
# The sentences are joined with single spaces into one prompt string.
system_message = " ".join([
    "Categorise this item strictly using only the taxonomy from your training data.",
    "Use only pre-existing categories and never create new ones.",
    "If the exact category is unclear, pick the closest valid Level 3 category (most granular) rather than inventing a new one.",
    "Assign its corresponding fixed Level 2 (mid-level) and Level 1 (broad) categories.",
    "Ensure characteristics and flavours are only from the predefined lists.",
    "Output your answer in valid JSON with keys: category_3, category_2, category_1, characteristics, and flavours.",
])

# Process each row to build a training example
def _clean_cell(value):
    """Return a cell as stripped text, or "" when the cell is missing (NaN/None).

    Non-string cells (e.g. numbers parsed by pandas) are converted with str()
    instead of failing on a missing .strip() method.
    """
    if pd.isnull(value):
        return ""
    return str(value).strip()


def _collect_cells(row, prefix, count=5):
    """Return the non-empty values of columns <prefix>1..<prefix><count>, in column order.

    Columns that are absent from the row are treated the same as empty cells.
    """
    cells = (_clean_cell(row.get(f"{prefix}{i}")) for i in range(1, count + 1))
    return [cell for cell in cells if cell]


skipped_rows = 0
for _, row in df.iterrows():
    # The item name is the whole user message; a row without one cannot form a
    # usable training example, so it is skipped rather than crashing the run.
    item_name = _clean_cell(row["Item Name"])
    if not item_name:
        skipped_rows += 1
        continue

    # Build the conversation in the required format:
    # - System message: fixed instruction
    # - User message: item name only
    # - Assistant message: expected output as a JSON string
    expected_answer = {
        "category_3": _clean_cell(row["L3"]),
        "category_2": _clean_cell(row["L2"]),
        "category_1": _clean_cell(row["L1"]),
        "characteristics": _collect_cells(row, "Tag"),   # Tag1..Tag5
        "flavours": _collect_cells(row, "Flav"),         # Flav1..Flav5 (may be empty)
    }
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": item_name},
        # ensure_ascii=False keeps non-ASCII characters readable in the target text
        {"role": "assistant", "content": json.dumps(expected_answer, ensure_ascii=False)},
    ]

    output_list.append({"messages": messages})

if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with no item name")

# Serialise the examples as JSONL: one JSON object per line, UTF-8 encoded,
# with non-ASCII characters written as-is rather than escaped.
with open(training_jsonl_filename, "w", encoding="utf8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(example, ensure_ascii=False) + "\n" for example in output_list
    )

print(f"Training data written to {training_jsonl_filename}")




Training data written to training_data.jsonl


In [10]:
# -------------------------------
# UPLOAD TRAINING FILE & LAUNCH FINE-TUNING JOB
# -------------------------------
# Send the JSONL file to OpenAI in binary mode; the handle is closed as soon as
# the upload call returns.
with open(training_jsonl_filename, "rb") as training_file:
    file_response = client.files.create(
        purpose="fine-tune",
        file=training_file,
    )

# The identifier is read as an attribute of the returned file object
training_file_id = file_response.id
print("Uploaded training file. File ID:", training_file_id)


# Create the supervised fine-tuning job with explicit hyperparameters:
#  - n_epochs: 5
#  - batch_size: 20
#  - learning_rate_multiplier: 0.6
# They live in one named dict so the values sent to the API and the values
# documented here cannot drift apart.
fine_tune_hyperparameters = {
    "n_epochs": 5,
    "batch_size": 20,
    "learning_rate_multiplier": 0.6,
}

job = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=gpt_model,
    method={
        "type": "supervised",
        "supervised": {"hyperparameters": fine_tune_hyperparameters},
    },
)
print("Fine-tuning job created. Job ID:", job.id)

Uploaded training file. File ID: file-EaVB3mxcZKy4yxgRHPyHcU
Fine-tuning job created. Job ID: ftjob-UIxbT5qmhYTSkfCJQltLNVOp


In [11]:
# -------------------------------
# RETRIEVE LATEST FINE-TUNING JOB
# -------------------------------
print("Fetching latest fine-tuning job...")

# NOTE(review): this takes the first job returned by the list endpoint
# (presumably the newest on the account - confirm ordering), which is not
# necessarily the job created by this notebook if other jobs are being launched
# with the same credentials.
try:
    recent_jobs = client.fine_tuning.jobs.list(limit=1).data
except Exception as e:
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down and discards all state rather than reporting the failure.
    raise RuntimeError(f"Error retrieving latest fine-tuning job: {e}") from e

if not recent_jobs:
    raise RuntimeError("No fine-tuning jobs were returned for this API key.")

latest_job = recent_jobs[0]
fine_tuning_job_id = latest_job.id
print(f"Latest fine-tuning job ID: {fine_tuning_job_id}")

# -------------------------------
# POLLING LOOP FOR JOB STATUS
# -------------------------------
print("Polling fine-tuning job status...")

poll_interval_seconds = 30    # pause between status checks
max_consecutive_errors = 5    # give up once the API fails this many times in a row

# Defined up front so later cells see None (not a NameError or a stale value
# from an earlier run) when the job ends without producing a model.
fine_tuned_model = None
consecutive_errors = 0

while True:
    try:
        current_job = client.fine_tuning.jobs.retrieve(fine_tuning_job_id)
    except Exception as e:
        # Transient failures are tolerated, but a persistent one (bad key,
        # unknown job ID, ...) must not keep the loop spinning forever.
        consecutive_errors += 1
        print("Error retrieving job status:", e)
        if consecutive_errors >= max_consecutive_errors:
            raise RuntimeError(
                f"Giving up after {consecutive_errors} consecutive errors while polling "
                f"job {fine_tuning_job_id}"
            ) from e
    else:
        consecutive_errors = 0
        status = current_job.status
        print(f"{datetime.now()} - Job status: {status}")

        if status == "succeeded":
            fine_tuned_model = current_job.fine_tuned_model
            print("Fine-tuning completed successfully! Fine-tuned model:", fine_tuned_model)
            break
        if status in ("failed", "cancelled"):
            print("Fine-tuning job did not succeed. Status:", status)
            # Surface whatever failure detail the job object carries, if any
            job_error = getattr(current_job, "error", None)
            if job_error:
                print("Reported error:", job_error)
            break

    time.sleep(poll_interval_seconds)  # wait before checking again


Fetching latest fine-tuning job...
Latest fine-tuning job ID: ftjob-UIxbT5qmhYTSkfCJQltLNVOp
Polling fine-tuning job status...
2025-03-18 09:54:07.928383 - Job status: validating_files
2025-03-18 09:54:38.358110 - Job status: validating_files
2025-03-18 09:55:08.589170 - Job status: validating_files
2025-03-18 09:55:38.870023 - Job status: validating_files
2025-03-18 09:56:09.136998 - Job status: validating_files
2025-03-18 09:56:39.438987 - Job status: validating_files
2025-03-18 09:57:09.719827 - Job status: validating_files
2025-03-18 09:57:39.969836 - Job status: validating_files
2025-03-18 09:58:10.238560 - Job status: validating_files
2025-03-18 09:58:40.504226 - Job status: validating_files
2025-03-18 09:59:10.769189 - Job status: validating_files
2025-03-18 09:59:41.049892 - Job status: validating_files
2025-03-18 10:00:11.350096 - Job status: running
2025-03-18 10:00:41.650408 - Job status: running
2025-03-18 10:01:11.916395 - Job status: running
2025-03-18 10:01:42.219917 - J