# Config

In [None]:
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
load_dotenv()


# Read the API key from the environment (load_dotenv() above has already been
# called, so values from a local .env file are visible here too).
# os.environ.get returns None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# Shared OpenAI client used by every request in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
from pydantic import BaseModel

class Item(BaseModel):
    """An item to be categorized.

    Only ``id`` is declared; callers in this notebook also pass extra keyword
    attributes (for example ``brand`` or ``data_type``).
    """
    id: str # the id of the item
    class Config:
        # Permit attributes that are not declared as fields on the model.
        extra = "allow"

class Category(BaseModel):
    """A category that items can be assigned to."""
    name: str # snake_case name of the category
    description: str # description of the category, what it represents

class ClassifiedItem(BaseModel):
    """The result of classifying one item: which item went to which category."""
    item_id: str # the id of the item
    category_name: str # the name of the category


# Proof of Concept

In [None]:
# Sample data for the one-shot classification experiment.
raw_items = ["apple", "banana", "carrot", "dog", "broccoli"]
items = [Item(id=raw_item) for raw_item in raw_items]
categories = [
    Category(name=category_name, description=category_description)
    for category_name, category_description in [
        ("fruit", "fruits"),
        ("vegetable", "vegetables"),
        ("animal", "animals"),
        ("mineral", "minerals"),
    ]
]

# The prompt embeds the Python repr of the item and category dictionaries.
prompt = f"""I will provide you with items and categories. You need to classify the items into the correct category.
ITEMS:
```
{[entry.dict() for entry in items]}
```
CATEGORIES:
```
{[entry.dict() for entry in categories]}
```"""

# JSON schema of one classification result; category_name is limited to the
# names of the categories defined above.
classified_item_schema = {
    "type": "object",
    "properties": {
        "item_id": {
            "type": "string",
            "description": "The id of the item",
        },
        "category_name": {
            "type": "string",
            "description": "The name of the category",
            "enum": [entry.name for entry in categories],
        },
    },
    "required": ["item_id", "category_name"],
    "additionalProperties": False,
}

# Tool definition the model is forced to call (see tool_choice below).
classifier_tool = {
    "type": "function",
    "function": {
        "name": "classifier",
        "description": "Classify items into categories",
        "strict": True,
        "parameters": {
            "type": "object",
            "properties": {
                "classified_items": {
                    "type": "array",
                    "items": classified_item_schema,
                },
            },
            "required": ["classified_items"],
            "additionalProperties": False,
        },
    },
}

res = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    tools=[classifier_tool],
    tool_choice={"type": "function", "function": {"name": "classifier"}},
)

# The structured answer arrives as the JSON arguments of the first tool call.
tool_arguments = res.choices[0].message.tool_calls[0].function.arguments
json_res = json.loads(tool_arguments)
print(json.dumps(json_res, indent=2))

In [None]:
# Parameters and sample data for the one-shot category-creation experiment.
max_categories = 7
categorization_method = "categorize in a way so that there is minumum potential overlap between categories"
raw_items = ["apple", "banana", "carrot", "dog", "broccoli"]
items = [Item(id=raw_item) for raw_item in raw_items]

# The prompt embeds the guideline and the Python repr of the item dictionaries.
prompt = f"""I will provide you with items. You need to create categories according to the following guideline
```
There must be at most {max_categories} categories.
{categorization_method}
```
ITEMS:
```
{[entry.dict() for entry in items]}
```"""

# JSON schema of one proposed category.
category_schema = {
    "type": "object",
    "properties": {
        "name": {
            "type": "string",
            "description": "Snake case name of the category",
        },
        "description": {
            "type": "string",
            "description": "The description of the category",
        },
    },
    "required": ["name", "description"],
    "additionalProperties": False,
}

# Tool definition the model is forced to call (see tool_choice below).
category_creator_tool = {
    "type": "function",
    "function": {
        "name": "category_creator",
        "description": "Create categories for items",
        "strict": True,
        "parameters": {
            "type": "object",
            "properties": {
                "categories": {
                    "type": "array",
                    "items": category_schema,
                },
            },
            "required": ["categories"],
            "additionalProperties": False,
        },
    },
}

res = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    tools=[category_creator_tool],
    tool_choice={"type": "function", "function": {"name": "category_creator"}},
)

# The structured answer arrives as the JSON arguments of the first tool call.
tool_arguments = res.choices[0].message.tool_calls[0].function.arguments
json_res = json.loads(tool_arguments)
print(json.dumps(json_res, indent=2))

In [None]:
# Display the full message object of the first choice from the last request.
res.choices[0].message

# System

In [None]:
from typing import List, Optional

class TreeNode:
    """A node of the category tree.

    Each node wraps one category, holds the items assigned to that category,
    and links to its parent and child nodes.
    """

    def __init__(self, category: 'Category'):
        """Create a detached node (no parent, no children, no items) for ``category``."""
        self.category = category
        self.items: List['Item'] = []
        self.children: List['TreeNode'] = []
        self.parent: Optional['TreeNode'] = None

    def add_child(self, child: 'TreeNode'):
        """Attach ``child`` as the last child of this node.

        If ``child`` is still attached to a parent it is detached from it
        first, so a node never appears in two ``children`` lists at once.
        """
        if child.parent is not None:
            child.parent.remove_child(child)
        child.parent = self
        self.children.append(child)

    def add_item(self, item: 'Item'):
        """Append ``item`` to the items held directly by this node."""
        self.items.append(item)

    def remove_child(self, child: 'TreeNode'):
        """Detach ``child`` from this node; do nothing if it is not a child."""
        if child in self.children:
            self.children.remove(child)
            child.parent = None

    def find_subcategory(self, category_name: str) -> Optional['TreeNode']:
        """Return the first node named ``category_name`` in this subtree.

        The search is depth-first and includes this node itself. Returns
        ``None`` when no node in the subtree has that category name.
        """
        if self.category.name == category_name:
            return self
        for child in self.children:
            result = child.find_subcategory(category_name)
            # Compare against None explicitly rather than relying on truthiness.
            if result is not None:
                return result
        return None

    def __repr__(self):
        return f"TreeNode(category={self.category.name}, items_count={len(self.items)}, children_count={len(self.children)})"

In [None]:
# Build a three-level tree: root -> electronics -> phones.
root_category = Category(name="root", description="Root category")
root_node = TreeNode(root_category)
electronics = TreeNode(Category(name="electronics", description="Electronic devices"))
phones = TreeNode(Category(name="phones", description="Mobile phones"))
root_node.add_child(electronics)
electronics.add_child(phones)

# Attach one item, carrying extra attributes, to the leaf category.
phone_item = Item(id="1", brand="Apple", model="iPhone 12")
phones.add_item(phone_item)

# Look the leaf up by name, starting the search at the root.
found_category = root_node.find_subcategory("phones")
print(found_category)  # Should print the 'phones' TreeNode

In [None]:
from time import sleep

class CreateCategoriesChat:
    """Interactive chat that asks the model to propose categories for a node's items.

    The conversation starts with a prompt listing the node's items and the
    categorization guideline. Each round forces a ``category_creator`` tool
    call, records it in the chat history, and lets the user answer with
    feedback that is sent along in the next round.
    """

    # Tool the model is forced to call. ``strict`` asks the API to enforce the
    # schema, as the proof-of-concept requests in this notebook already do.
    CATEGORY_CREATOR_TOOL = {
        "type": "function",
        "function": {
            "name": "category_creator",
            "description": "Create categories for items",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "categories": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {
                                    "type": "string",
                                    "description": "Snake case name of the category"
                                },
                                "description": {
                                    "type": "string",
                                    "description": "The description of the category"
                                },
                            },
                            "required": ["name", "description"],
                            "additionalProperties": False
                        },
                    },
                },
                "required": ["categories"],
                "additionalProperties": False,
            },
        },
    }

    def __init__(self, client: 'OpenAI', tree_node: 'TreeNode', max_categories: int, categorization_method: str):
        """Store the settings and seed the chat history with the initial prompt.

        Args:
            client: OpenAI client used for the chat completion requests.
            tree_node: Node whose ``items`` are to be categorized.
            max_categories: Upper bound on the number of categories, stated in the prompt.
            categorization_method: Free-text guideline inserted into the prompt.
        """
        self.tree_node = tree_node
        self.max_categories = max_categories
        self.categorization_method = categorization_method
        self.chat_history = []
        self.client = client
        self.initialize_chat()

    def initialize_chat(self):
        """Reset the chat history to a single user message describing the task."""
        items = self.tree_node.items
        prompt = f"""I will provide you with items. You need to create categories according to the following guideline
```
There must be at most {self.max_categories} categories.
{self.categorization_method}
```
ITEMS:
```
{[item.dict() for item in items]}
```"""
        self.chat_history = [{"role": "user", "content": prompt}]

    def generate_categories(self) -> List['Category']:
        """Request a category proposal from the model and record the exchange.

        Returns:
            The categories parsed from the first tool call of the response.

        Raises:
            ValueError: If the chat history is empty or its latest message is
                not a user message.
        """
        if not self.chat_history or self.chat_history[-1]["role"] != "user":
            raise ValueError("The latest message must be a user message before generating categories.")

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.chat_history,
            tools=[self.CATEGORY_CREATOR_TOOL],
            tool_choice={"type": "function", "function": {"name": "category_creator"}},
        )
        message = response.choices[0].message

        # Record the assistant turn. Fields that are None are left out so that
        # null-valued optional fields are not replayed on the next request.
        self.chat_history.append(message.model_dump(exclude_none=True))

        # Answer every tool call with a placeholder result; a tool call left
        # without a matching "tool" message makes the next request invalid.
        for tool_call in message.tool_calls:
            self.chat_history.append({
                "role": "tool",
                "content": json.dumps({"status": "ok"}),
                "tool_call_id": tool_call.id
            })

        # Parse and return categories
        categories_data = json.loads(message.tool_calls[0].function.arguments)
        return [Category(**cat) for cat in categories_data["categories"]]

    def submit_feedback(self, feedback: str):
        """Append ``feedback`` to the chat history as a user message."""
        self.chat_history.append({"role": "user", "content": feedback})

    def get_latest_categories(self) -> List['Category']:
        """Return the categories of the most recent assistant tool call.

        Returns an empty list when the history holds no assistant message
        with tool calls.
        """
        for message in reversed(self.chat_history):
            if message["role"] == "assistant" and message.get("tool_calls"):
                tool_call = message["tool_calls"][0]
                categories_data = json.loads(tool_call["function"]["arguments"])
                return [Category(**cat) for cat in categories_data["categories"]]
        return []

    @staticmethod
    def _is_done(feedback: str) -> bool:
        """Return True when the user's input means "stop iterating"."""
        # Ignore case and surrounding whitespace so "Done " also ends the loop.
        return feedback.strip().lower() == 'done'

    def run_category_generation_flow(self, max_iterations: int = 3):
        """Alternate between generating categories and collecting user feedback.

        Runs at most ``max_iterations`` rounds and stops early when the user
        types 'done'. Returns the most recently generated categories.
        """
        for _ in range(max_iterations):
            categories = self.generate_categories()
            print(f"Generated categories for \"{self.tree_node.category.name}\":", categories)
            sleep(0.2)  # Optional delay between iterations

            feedback = input("Provide feedback (or type 'done' to finish): ")
            if self._is_done(feedback):
                break

            self.submit_feedback(feedback)

        return self.get_latest_categories()

In [None]:
# Put a handful of sample items directly on a fresh root node.
tree_node = TreeNode(Category(name="root", description="Root category"))
raw_items = ["apple", "banana", "carrot", "dog", "broccoli"]
items = [Item(id=raw_item) for raw_item in raw_items]
for item in items:
    tree_node.add_item(item)

# Run the interactive category-generation loop for that node.
chat = CreateCategoriesChat(
    client,
    tree_node,
    max_categories=5,
    categorization_method="Minimize overlap between categories",
)
final_categories = chat.run_category_generation_flow()
print("Final categories:", final_categories)

In [None]:
class Classifier:
    """Assign the items of a tree node to that node's direct child categories."""

    def __init__(self, client: 'OpenAI', tree_node: 'TreeNode'):
        """Remember the OpenAI client and the node whose items are classified."""
        self.client = client
        self.tree_node = tree_node

    @staticmethod
    def _build_tool(category_names: List[str]) -> dict:
        """Return the ``classifier`` tool definition restricted to ``category_names``."""
        return {
            "type": "function",
            "function": {
                "name": "classifier",
                "description": "Classify items into categories",
                # Ask the API to enforce the schema, as the proof-of-concept
                # classification request in this notebook already does.
                "strict": True,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "classified_items": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "item_id": {
                                        "type": "string",
                                        "description": "The id of the item"
                                    },
                                    "category_name": {
                                        "type": "string",
                                        "description": "The name of the category",
                                        "enum": category_names
                                    },
                                },
                                "required": ["item_id", "category_name"],
                                "additionalProperties": False
                            },
                        },
                    },
                    "required": ["classified_items"],
                    "additionalProperties": False,
                },
            },
        }

    def classify_items(self) -> List['ClassifiedItem']:
        """Ask the model to pick one child category for every item of the node.

        Returns:
            One ``ClassifiedItem`` per entry in the model's tool call. An empty
            list is returned without contacting the API when the node has no
            items or no child categories, since there is nothing to classify
            and the ``enum`` of category names would be empty.
        """
        categories = [child.category for child in self.tree_node.children]
        items = self.tree_node.items
        if not items or not categories:
            return []

        prompt = f"""I will provide you with items and categories. You need to classify the items into the correct category.
ITEMS:
```
{[item.dict() for item in items]}
```
CATEGORIES:
```
{[category.dict() for category in categories]}
```"""

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            tools=[self._build_tool([category.name for category in categories])],
            tool_choice={"type": "function", "function": {"name": "classifier"}},
        )

        classified_items_data = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
        return [ClassifiedItem(**item) for item in classified_items_data["classified_items"]]

    def update_tree(self, classified_items: List['ClassifiedItem']):
        """Add each classified item to the matching direct child node.

        Entries whose item id or category name is unknown are ignored. Items
        are also kept in this node's own ``items`` list; they are copied to
        the child, not moved.
        """
        # Index once instead of scanning the lists for every classified item.
        # setdefault keeps the first occurrence when ids or names repeat.
        items_by_id = {}
        for item in self.tree_node.items:
            items_by_id.setdefault(item.id, item)

        # Only direct children were offered to the model, so only they are
        # valid targets. A recursive search could match this node itself or a
        # deeper descendant that happens to share the name.
        children_by_name = {}
        for child in self.tree_node.children:
            children_by_name.setdefault(child.category.name, child)

        for classified_item in classified_items:
            item = items_by_id.get(classified_item.item_id)
            category_node = children_by_name.get(classified_item.category_name)
            if item is None or category_node is None:
                continue
            category_node.add_item(item)

    def run_classification(self):
        """Classify the node's items, update the tree, and return the classifications."""
        classified_items = self.classify_items()
        self.update_tree(classified_items)
        return classified_items

In [None]:
def print_tree(node, level=0):
    """Print the subtree rooted at ``node``, one category per line.

    Each line shows the category name followed by the ids of the items held
    by that node, indented by two spaces per tree level. ``level`` is the
    indentation depth used for ``node`` itself.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped, and therefore printed, in their original order.
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        item_ids = [item.id for item in current.items]
        print("  " * depth + f"{current.category.name}: {item_ids}")
        pending.extend((child, depth + 1) for child in reversed(current.children))

In [None]:
# Put a handful of sample items directly on a fresh root node.
tree_node = TreeNode(Category(name="root", description="Root category"))
raw_items = ["apple", "banana", "carrot", "dog", "broccoli"]
items = [Item(id=raw_item) for raw_item in raw_items]
for item in items:
    tree_node.add_item(item)

# Step 1: let the model (with user feedback) propose the categories.
chat = CreateCategoriesChat(
    client,
    tree_node,
    max_categories=5,
    categorization_method="Minimize overlap between categories",
)
final_categories = chat.run_category_generation_flow()
print("Final categories:", final_categories)

# Step 2: hang every generated category under the root as a child node.
for category in final_categories:
    tree_node.add_child(TreeNode(category))

# Step 3: classify the root's items into those child categories.
classifier = Classifier(client, tree_node)
classified_items = classifier.run_classification()

print("\nClassified Items:")
for item in classified_items:
    print(f"Item: {item.item_id}, Category: {item.category_name}")

print("\nUpdated Tree Structure:")

print_tree(tree_node)

# Implementation

In [None]:
import re
from typing import List, Dict, Any
from pydantic import BaseModel

# Assuming we have the previously defined Item, Category, TreeNode, and Classifier classes

class DocumentField(BaseModel):
    """One field read from the source document: an identifier and its data type."""
    id: str  # field identifier
    data_type: str  # data type text as it appears in the document

def parse_document(file_path: str) -> List[DocumentField]:
    """Read a tab-separated text file and return its fields.

    Lines that start with ``#`` and lines with fewer than three tab-separated
    columns are skipped. For every remaining line the second column becomes
    the field id (surrounding ``[``/``]`` characters removed, lowercased) and
    the third column becomes the data type (surrounding ``[``/``]`` removed).
    """
    with open(file_path, 'r') as handle:
        rows = [
            raw_line.strip().split('\t')
            for raw_line in handle
            if not raw_line.startswith('#')
        ]
    return [
        DocumentField(id=columns[1].strip('[]').lower(), data_type=columns[2].strip('[]'))
        for columns in rows
        if len(columns) >= 3
    ]

# Parse consumer_dict.txt; as the cell's last expression, the result is displayed.
parse_document("consumer_dict.txt")

In [None]:
def create_classification_structure() -> TreeNode:
    """Build the root node together with its six fixed first-level categories."""
    first_level = (
        Category(
            name="personalInformation",
            description="Includes core personal details, contact information, and demographic data.",
        ),
        Category(
            name="geographicData",
            description="Encompasses location-related information, including address details and census data.",
        ),
        Category(
            name="financialInformation",
            description="Covers all finance-related data, including credit, investments, and property information.",
        ),
        Category(
            name="behavioralData",
            description="Includes purchasing behavior, interests, hobbies, and lifestyle indicators.",
        ),
        Category(
            name="vehicleAndInsuranceInformation",
            description="Covers all data related to vehicle ownership and insurance.",
        ),
        Category(
            name="socioCulturalAttributes",
            description="Encompasses ethnicity, language, political affiliation, and donation behavior.",
        ),
    )

    root = TreeNode(Category(name="rootCategory", description="Root category for all data fields"))
    # Children are attached in the order listed above.
    for category in first_level:
        root.add_child(TreeNode(category))
    return root

def classify_fields(client: OpenAI, root: TreeNode, fields: List[DocumentField]) -> List[ClassifiedItem]:
    """Turn the parsed fields into items on ``root`` and classify them.

    Every field becomes an ``Item`` carrying its ``data_type`` as an extra
    attribute. The items are added to ``root`` and then classified into
    ``root``'s direct child categories; the classifications are returned.
    """
    converted = [Item(id=entry.id, data_type=entry.data_type) for entry in fields]
    for converted_item in converted:
        root.add_item(converted_item)

    return Classifier(client, root).run_classification()

def generate_subcategories(client: OpenAI, root: TreeNode) -> None:
    """Add one level of generated subcategories below each child of ``root``.

    For every main category an interactive category-generation chat is run
    (at most five categories) and each resulting category is attached to that
    main category as a new child node.
    """
    for main_category in root.children:
        guideline = f"Create subcategories for {main_category.category.name}"
        session = CreateCategoriesChat(
            client,
            main_category,
            max_categories=5,
            categorization_method=guideline,
        )
        for proposed in session.run_category_generation_flow():
            main_category.add_child(TreeNode(proposed))

def classify_into_subcategories(client: OpenAI, root: TreeNode) -> List[ClassifiedItem]:
    """Classify each main category's items into that category's subcategories.

    Runs one classification per direct child of ``root`` and returns all
    results as a single list, in child order.
    """
    collected: List[ClassifiedItem] = []
    for node in root.children:
        collected += Classifier(client, node).run_classification()
    return collected

def main(client: OpenAI, file_path: str):
    """Run the whole pipeline on the document at ``file_path`` and print each stage.

    Steps: parse the fields, build the fixed first-level categories, classify
    the fields into them, generate subcategories interactively, classify into
    the subcategories, and print the resulting tree.
    """
    # Step 1: read the fields from the document.
    fields = parse_document(file_path)
    print("Parsed Fields:", len(fields))

    # Step 2: build the fixed top of the tree and show it.
    root = create_classification_structure()
    print("\nInitial Tree Structure:")
    print_tree(root)

    # Step 3: first-level classification.
    first_level_results = classify_fields(client, root, fields)
    print("First-level classification:")
    for result in first_level_results:
        print(f"Item: {result.item_id}, Category: {result.category_name}")

    # Step 4: let the model (with user feedback) propose subcategories.
    generate_subcategories(client, root)

    # Step 5: second-level classification.
    second_level_results = classify_into_subcategories(client, root)
    print("\nSecond-level classification:")
    for result in second_level_results:
        print(f"Item: {result.item_id}, Subcategory: {result.category_name}")

    # Step 6: show the finished tree.
    print("\nFinal Tree Structure:")
    print_tree(root)


In [None]:
# Run the full pipeline on consumer_dict.txt using the shared client.
main(client, "consumer_dict.txt")