In [1]:
import os
import pandas as pd
import openai
from openai import OpenAI
import re
from tqdm import tqdm
from datetime import datetime

In [2]:
api_key = "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"  

client = OpenAI(api_key=api_key)

gpt3_model = "ft:gpt-3.5-turbo-0125:personal::9TsbbInd"

items_file_path = r"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\4 - Processed Data Files\all_items.xlsx"
category_file_path = r"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\2 - Training Data\Categorised Fine Tuning.xlsx"

In [3]:
items_df = pd.read_excel(items_file_path)

In [4]:
def prompt_gpt(item):

    # Prompting fine tuned 3.5 model
    completion = client.chat.completions.create(
        model=gpt3_model,
        messages=[
            {"role": "system", "content": "Using only the categories included in your fine-tuning training dataset, categorise and tag the item based on the product's nature. Max 6 Tags."},
            {"role": "user", "content": item}
        ],
        #max_tokens=150,
        temperature=0.1,  # Lower temperature for more deterministic responses
        top_p=0.2,#Probability selection of next token (0 for less diverse, 1 for max diversity)
        frequency_penalty=-1,  # To reduce repetition (-2 no penalty, 2 avoids repetition)
        presence_penalty=0,  # To encourage variety in mentioning new topics (-2 sticks to old topics, 2 gets new ones)
        stop=["\n", "<|endoftext|>"]  # Stop at end of message or when the model thinks it's the end of the text
    )

    return completion.choices[0].message.content

def parse_item_description(description):
    # Regular expression pattern to keep only alphanumerics, spaces, and slashes
    pattern = re.compile(r'[^a-zA-Z0-9 /,&\s-]')
    
    # Split the string into components based on semicolons
    parts = description.split(';')
    
    # Initialize dictionary to hold the parsed data
    parsed_data = {}
    
    # Process each part of the split description
    for part in parts:
        # Split each part into a key and value
        key, value = part.split(':')
        key = pattern.sub('', key.strip())
        value = pattern.sub('', value.strip())
        
        # Handle the 'Tags' differently since they are a list of values
        if not key == 'Category':
            # Split the tags by commas, strip spaces, and remove unwanted punctuation
            parsed_data[key] = [pattern.sub('', tag.strip()) for tag in value.split(',')]
        else:
            # Assign the value to the corresponding key in the dictionary
            parsed_data[key] = value
    
    return parsed_data['Category'], parsed_data['Flavours'], parsed_data['Tags']


def create_data_dict(item, category, flavours, tags):
    if not isinstance(tags, list) or not isinstance(flavours, list):
        raise ValueError("Tags and flavours must be lists.")

    # Prepare the data dictionary with basic fields
    data = {'Item Name': item, 'Category': category}

    # Add flavours to the data dictionary
    for i in range(1, len(flavours) + 1):
        data[f'Flavour{i}'] = flavours[i-1]
    
    # Add tags to the data dictionary
    for i in range(1, len(tags) + 1):
        data[f'Tag{i}'] = tags[i-1]
    
    return data


In [5]:
categorised_list = []

# Iterate over each item in items_df["name"] with a progress bar
for item in tqdm(items_df["name"], desc="Categorising Items"):
    retries = 0
    while retries < 3:
        try:
            # Prompt fine-tuned model with item to get categorization
            completion_string = prompt_gpt(item)

            # Clean and split category output
            category, flavours, tags = parse_item_description(completion_string)

            # Insert parsed data into the DataFrame
            categorised_list.append(create_data_dict(item, category, flavours, tags))

            break  # Exit the retry loop if successful

        except:
            retries += 1
            if retries < 3:
                print(f"Retrying {retries}/3...")
            else:
                print("Skipping")
                break

# Convert the list of dictionaries to a DataFrame
categorised_df = pd.DataFrame(categorised_list)

# Create datetime string to save data with
datetimenow = datetime.now().strftime('%d%m%y_%H%M')

categorised_df.to_excel(rf"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\4 - Processed Data Files\Categorisations\categorised_items_{datetimenow}.xlsx", index=False)


Categorising Items:   5%|▍         | 87/1871 [00:59<18:16,  1.63it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:   5%|▍         | 88/1871 [01:02<39:58,  1.35s/it]

Skipping


Categorising Items:   6%|▌         | 112/1871 [01:21<26:43,  1.10it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:   6%|▌         | 113/1871 [01:24<41:17,  1.41s/it]

Skipping


Categorising Items:  30%|███       | 570/1871 [07:41<12:38,  1.71it/s]  

Retrying 1/3...
Retrying 2/3...


Categorising Items:  31%|███       | 571/1871 [07:44<27:33,  1.27s/it]

Skipping


Categorising Items:  36%|███▋      | 681/1871 [08:57<15:21,  1.29it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  36%|███▋      | 682/1871 [09:00<26:27,  1.34s/it]

Skipping


Categorising Items:  44%|████▍     | 828/1871 [10:32<12:04,  1.44it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  44%|████▍     | 829/1871 [10:34<18:59,  1.09s/it]

Skipping


Categorising Items:  45%|████▌     | 844/1871 [10:45<13:03,  1.31it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  45%|████▌     | 845/1871 [10:47<20:45,  1.21s/it]

Skipping


Categorising Items:  51%|█████     | 956/1871 [12:00<09:44,  1.57it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  51%|█████     | 957/1871 [12:03<19:36,  1.29s/it]

Skipping


Categorising Items:  52%|█████▏    | 969/1871 [12:11<09:10,  1.64it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  52%|█████▏    | 970/1871 [12:13<15:56,  1.06s/it]

Skipping


Categorising Items:  60%|█████▉    | 1114/1871 [13:45<07:15,  1.74it/s]

Retrying 1/3...


Categorising Items:  61%|██████    | 1141/1871 [14:03<08:04,  1.51it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  61%|██████    | 1142/1871 [14:05<14:32,  1.20s/it]

Skipping


Categorising Items:  70%|██████▉   | 1308/1871 [15:49<06:29,  1.44it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  70%|██████▉   | 1309/1871 [15:51<11:07,  1.19s/it]

Skipping


Categorising Items:  71%|███████▏  | 1334/1871 [16:08<06:07,  1.46it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  71%|███████▏  | 1335/1871 [16:09<05:54,  1.51it/s]

Skipping


Categorising Items:  80%|███████▉  | 1495/1871 [17:57<03:54,  1.60it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  80%|███████▉  | 1496/1871 [18:00<08:04,  1.29s/it]

Skipping


Categorising Items:  81%|████████  | 1510/1871 [18:08<03:23,  1.78it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  81%|████████  | 1511/1871 [18:10<05:04,  1.18it/s]

Skipping


Categorising Items:  95%|█████████▌| 1780/1871 [21:19<00:50,  1.80it/s]

Retrying 1/3...
Retrying 2/3...


Categorising Items:  95%|█████████▌| 1781/1871 [21:21<01:38,  1.09s/it]

Skipping


Categorising Items: 100%|██████████| 1871/1871 [22:20<00:00,  1.40it/s]


In [6]:
categorised_df

Unnamed: 0,Item Name,Category,Flavour1,Tag1,Tag2,Tag3,Tag4,Tag5,Tag6,Flavour2
0,Hellmann's Light Mayonnaise Squeezy 650Ml,Condiments And Dressings,Savoury,Low Calorie Option,Fresh,,,,,
1,Express Tesco Egg Custard Tart 2 Pack,Other Desserts,Sweet,On The Go Food Or Drink,Low Nutritional Value,Animal Produce,,,,
2,Tesco Loose Red Peppers(C),Vegetables,Savoury,Fresh,Health Food Or Drink,Cooking Or Baking Ingredient,,,,
3,Tesco Sweetheart Cabbage (C),Vegetables,Neutral,Fresh,Health Food Or Drink,Cooking Or Baking Ingredient,,,,
4,Tesco Carrot 500G (C),Vegetables,Savoury,Fresh,Health Food Or Drink,Cooking Or Baking Ingredient,,,,
...,...,...,...,...,...,...,...,...,...,...
1852,Tunnocks Milk Chocolate Caramel Wafer 240G,Bars And Biscuits,Sweet,Low Nutritional Value,Chocolate,Low Nutritional Value,,,,
1853,COSTA EXPRESS LARGE,Coffee,Neutral,Takeaway Food Or Drink,Low Nutritional Value,,,,,
1854,Snickers The Big One 100G (C),Bars And Biscuits,Sweet,Chocolate,Low Nutritional Value,,,,,
1855,Andrex Toilet Tissue 4 Roll White,Uncategorised,Non Food Item,Untagged,,,,,,
