# Context

This course includes example code for most topics. The example code relates to building a feature that answers user questions based on previous product reviews for a hardware e-commerce website.

This notebook builds the raw product description and reviews data.

When you apply the course lessons in your own business, you will not need an equivalent of this notebook, because the data you retrieve from your database to answer questions will be real data from your business.

In [1]:
import asyncio
import json
from typing import Dict, List, Optional

import instructor
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

# Synchronous OpenAI client wrapped by instructor. It is used by
# generate_physical_objects(), which passes `response_model=` to get typed results.
client = instructor.from_openai(OpenAI())


# A single synthetic hardware-store product. Used as the element type of the
# `response_model` in generate_physical_objects() and as the input to make_reviews().
# NOTE(review): documented with comments rather than a class docstring on purpose --
# a docstring on a pydantic model may end up in the schema sent to the model; confirm
# before adding one.
class Product(BaseModel):
    title: str  # product name; the generation prompt asks for titles to repeat 2-3 times
    description: str  # 2-3 sentence description that distinguishes same-titled products


# Number of reviews requested for each product; this is the default value of the
# `reviews_per_product` parameter of create_synthetic_reviews().
reviews_per_product = 10


def generate_physical_objects(n_objects: int = 150) -> List[Product]:
    """Ask the LLM for a list of synthetic hardware-store products.

    Args:
        n_objects: How many products to request. The model is free to return
            a different number, so the result may be shorter than requested.

    Returns:
        The products parsed by instructor into ``Product`` instances, or an
        empty list if the request or the parsing fails (the error is printed).
    """
    # Every instruction is a complete sentence and the pieces are joined with
    # newlines, so no two instructions run together in the final prompt.
    instructions = [
        f"Create a list of {n_objects} products someone might buy at a hardware store.",
        "Each product title should be repeated 2-3 times. Do not have any with duplicate product descriptions.",
        "So each product with a given title should have some small distinctions apparent from the description.",
        "Products can be small (a screw), large (a bandsaw) or anywhere in between.",
        "For each product, write a 2-3 sentence product description that might show up in a hardware retailers website underneath the product.",
        # NOTE(review): the next two instructions talk about *reviews*, although this
        # prompt only generates products. They are kept verbatim; confirm whether
        # they were meant for the review-generation prompt instead.
        "Do not create product reviews that contradict specific facts in other reviews.",
        "Contradicting subjective opinions in other reviews is ok only to the extent you would expect that in real data.",
        "Respond only with the list of products and descriptions.",
    ]
    prompt = "\n".join(instructions)

    try:
        return client.chat.completions.create(
            model="gpt-4o-mini",
            response_model=List[Product],
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back an empty
        # list so the notebook can continue.
        print(f"Error generating products: {str(e)}")
        return []


# Generate the product catalogue once; `objects` is reused by the later cells
# (review generation and the LanceDB products table).
objects = generate_physical_objects()
# The count printed here is simply the number of returned items; nothing in this
# cell de-duplicates them.
print(f"Created {len(objects)} unique objects")
print(f"First 10 objects: {objects[:10]}")

Created 114 unique objects
First 10 objects: [Product(title='Cordless Drill', description='This powerful cordless drill features a lightweight design and a 2-speed transmission, allowing you to tackle various tasks with ease. Ideal for both professionals and DIY enthusiasts, it comes with two batteries for extended usage.'), Product(title='Cordless Drill', description='Designed for versatility, this cordless drill offers 18 torque settings and a compact design perfect for tight spaces. Its ergonomic grip ensures comfort during prolonged use, making it an ideal tool for any project.'), Product(title='Cordless Drill', description='Equipped with a high-performance motor, this cordless drill ensures efficient drilling and driving. The LED light illuminates dark work areas, making it perfect for both indoor and outdoor tasks.'), Product(title='Adjustable Wrench', description='This adjustable wrench features a sleek design with a cushioned grip for added comfort during use. Its wide jaw open

We have created the list of objects. Now we will create the product reviews.

In [2]:
# Asynchronous OpenAI client wrapped by instructor; awaited in make_reviews()
# so that many review requests can be in flight at once.
async_client = instructor.from_openai(AsyncOpenAI())


# One generated review as returned by the LLM; used as the element type of the
# `response_model` in make_reviews().
# NOTE(review): kept docstring-free on purpose -- a docstring on a pydantic model
# may end up in the schema sent to the model; confirm before adding one.
class Review(BaseModel):
    review: str  # full text of the review


# A review joined with the product it was written for. make_reviews() builds one
# of these per generated review; they are later stored in LanceDB and in reviews.json.
class AllObjectInfo(BaseModel):
    product_title: str  # copied from Product.title
    product_description: str  # copied from Product.description
    review: str  # copied from Review.review


async def make_reviews(
    product: Product, n: int, semaphore: Optional[asyncio.Semaphore] = None
) -> List[AllObjectInfo]:
    """Generate ``n`` synthetic reviews for a single product.

    Args:
        product: The product whose title and description are put in the prompt.
        n: Number of reviews to ask the model for.
        semaphore: Limits how many requests run concurrently when the same
            semaphore is shared between calls. When omitted, a private
            semaphore is created for this call, so the call is not throttled
            against any other call.

    Returns:
        One ``AllObjectInfo`` per review returned by the model, or an empty
        list if the request or the parsing fails (the error is printed).
    """
    # Create the fallback semaphore per call instead of as a default argument:
    # a default is evaluated once at definition time and would be shared by
    # every call (and tied to whatever event loop first contends on it).
    if semaphore is None:
        semaphore = asyncio.Semaphore(1)

    async with semaphore:
        prompt = f"""
        Write {n} realistic but detailed/specific product reviews that might show up on a hardware store's website.

        The reviews should be about the following product:
        Product Title: {product.title}
        Product Description: {product.description}
        
        Add many relevant and concrete facts about the products (this is for synthetic data generation, make up facts about each product as necessary).

        To see the format of a possible review, here is a review for a saw:
        ```
        I've enjoyed using this saw. It is lightweight and the battery lasts longer than other brands.
        I've been using it for 3 years now and it has been very durable. It was twice as expensive as the PX-500. But
        it is comfortable to hold because of the light weight.
        ```

        Respond only with the reviews, and nothing else.
        """

        try:
            result = await async_client.chat.completions.create(
                model="gpt-4o",
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
            )
            # Attach the product context to every review so each record is
            # self-contained when stored.
            return [
                AllObjectInfo(
                    product_title=product.title,
                    product_description=product.description,
                    review=r.review,
                )
                for r in result
            ]

        except Exception as e:
            # Deliberate best-effort: one failed product must not abort the
            # whole batch, so report it and return no reviews for it.
            print(f"Error generating reviews for '{product.title}': {str(e)}")
            return []


async def create_synthetic_reviews(
    max_concurrency: int = 20, reviews_per_product: int = reviews_per_product
) -> List[AllObjectInfo]:
    """Generate reviews for every product in the module-level ``objects`` list.

    Args:
        max_concurrency: Maximum number of review requests allowed to run at
            the same time (size of the shared semaphore).
        reviews_per_product: Number of reviews requested for each product.

    Returns:
        A flat list with the reviews of all products whose task completed
        normally. Tasks that ended with an exception or were cancelled are
        reported with ``print`` and skipped.
    """
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [make_reviews(o, reviews_per_product, semaphore) for o in objects]
    # return_exceptions=True keeps one failing task from discarding the
    # results of all the others; failures come back as values in `results`.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    out: List[AllObjectInfo] = []
    # gather() preserves the order of its awaitables, so results line up with objects.
    for product, result in zip(objects, results):
        # Check BaseException, not Exception: a cancelled task is returned as a
        # CancelledError, which is not an Exception subclass on Python 3.8+ and
        # would otherwise be passed to extend() and raise TypeError.
        if isinstance(result, BaseException):
            print(f"Skipping reviews for '{product.title}': {result!r}")
            continue
        out.extend(result)
    return out


# Top-level `await`: this is a notebook cell; a plain Python script would need
# to drive the coroutine itself (e.g. with asyncio.run). `reviews` is reused by
# the later cells that fill LanceDB and write reviews.json.
reviews = await create_synthetic_reviews()

Store the items to be retrieved in LanceDB

In [3]:
# Connect to the LanceDB database stored under ./lancedb.
db = lancedb.connect("./lancedb")
# OpenAI embedding function ("text-embedding-3-small") obtained from LanceDB's
# registry; the table schemas below use it for their source and vector fields.
func = get_registry().get("openai").create(name="text-embedding-3-small")


# Schema of the "products" table.
# NOTE(review): every text column is declared with func.SourceField(); confirm
# which of them LanceDB actually embeds into `vector`.
class Products(LanceModel):
    id: str = func.SourceField()  # stringified position of the product in `objects`
    title: str = func.SourceField()
    description: str = func.SourceField()
    # Embedding column; its size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# (Re)create the products table; mode="overwrite" replaces any earlier table
# of the same name, so re-running the cell starts from scratch.
products_table = db.create_table("products", schema=Products, mode="overwrite")
# One row per generated product; the id is the product's index in `objects`.
products_data = [
    {"id": f"{i}", "title": obj.title, "description": obj.description}
    for i, obj in enumerate(objects)
]
products_table.add(products_data)
# Full-text-search index on the description column.
products_table.create_fts_index("description", replace=True)
# Lookup from product title to row id.
# NOTE(review): the generation prompt asks for each title to be repeated 2-3
# times, so titles are not unique and later rows overwrite earlier ones here.
# The map is also not used anywhere in the visible code -- confirm it is needed.
product_id_map = {
    p["title"]: p["id"] for p in products_table.to_pandas().to_dict("records")
}


# Schema of the "reviews" table: one row per generated review, with the product
# title and description stored alongside the review text.
# NOTE(review): every text column is declared with func.SourceField(); confirm
# which of them LanceDB actually embeds into `vector`.
class Reviews(LanceModel):
    id: str = func.SourceField()  # stringified position of the review in `reviews`
    product_title: str = func.SourceField()
    product_description: str = func.SourceField()
    review: str = func.SourceField()
    # Embedding column; its size comes from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# (Re)create the reviews table; mode="overwrite" replaces any earlier table
# of the same name.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

# Convert the AllObjectInfo records into plain dict rows; the id is the review's
# index in `reviews`.
# NOTE(review): despite the variable name, no product id is attached to the
# rows -- products are identified only by title and description.
reviews_with_product_id = [
    {
        "id": f"{i}",
        "product_title": review.product_title,
        "product_description": review.product_description,
        "review": review.review,
    }
    for i, review in enumerate(reviews)
]
reviews_table.add(reviews_with_product_id)
# Full-text-search index on the review text.
reviews_table.create_fts_index("review", replace=True)

In case you want to see the data quickly in a text editor, we also store the data in JSON.

In [4]:
# Dump every review record (product title, product description, review text)
# to reviews.json for quick inspection in a text editor.
with open("./reviews.json", "w", encoding="utf-8") as f:
    # model_dump() is the Pydantic v2 spelling of the deprecated .dict().
    json.dump([i.model_dump() for i in reviews], f)