# Context

This course includes example code for most topics. The example code relates to building a feature that answers user questions based on previous product reviews for a hardware e-commerce website.

This notebook generates the raw product descriptions and review data.

When you apply the course lessons in your own business, you will not need an equivalent of this notebook, because the data you retrieve from your database to answer questions will be real data from your business.

In [1]:
import asyncio
import json
from typing import Dict, List, Optional

import instructor
import lancedb
import numpy as np
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

# Synchronous OpenAI client wrapped by instructor so that chat completions
# accept a `response_model` (used by generate_physical_objects).
client = instructor.from_openai(OpenAI())


# A single hardware-store product. Also used as the structured response type
# (`List[Product]`) requested from the model in generate_physical_objects.
class Product(BaseModel):
    title: str  # product name; later used to de-duplicate products
    description: str  # short description of the product, as shown on a retailer's site


# Number of synthetic reviews to request for each product; this is the default
# value of the `reviews_per_product` parameter of create_synthetic_reviews.
reviews_per_product = 8


def generate_physical_objects(n_objects: int = 50) -> List[Product]:
    """Ask the model for a list of hardware-store products with descriptions.

    Args:
        n_objects: Number of products to request in the prompt. The response
            may contain fewer items or repeated titles, so callers should
            de-duplicate the result.

    Returns:
        The products parsed into ``Product`` instances, or an empty list when
        the request or the response parsing fails (the error is printed, not
        raised).
    """
    # Each fragment ends with explicit whitespace so that adjacent sentences
    # do not run together once the fragments are joined.
    prompt_parts = [
        f"Create a list of {n_objects} products someone might buy at a hardware store.\n",
        "The products should be varied. Do not have duplicates.\n",
        "Products can be small (a screw), large (a bandsaw) or anywhere in between.\n",
        "For each product, write a 2-3 sentence product description that might "
        "show up in a hardware retailers website underneath the product.\n",
        # NOTE(review): the next two sentences talk about reviews although this
        # prompt only asks for products. They are kept so the generated data
        # does not change; confirm whether they should be removed.
        "Do not create product reviews that contradict specific facts in other reviews. ",
        "Contradicting subjective opinions in other reviews is ok only to the "
        "extent you would expect that in real data.\n",
        "Respond only with the list of products and descriptions.",
    ]
    prompt = "".join(prompt_parts)

    try:
        return client.chat.completions.create(
            model="gpt-4o",
            response_model=List[Product],
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # an empty product list.
        print(f"Error generating products: {e}")
        return []


objects = generate_physical_objects()

# Keep only the first product seen for each exact title. A dict preserves
# insertion order, so the surviving products stay in their original order.
first_by_title = {}
for obj in objects:
    first_by_title.setdefault(obj.title, obj)

deduplicated_objects = list(first_by_title.values())
seen_titles = set(first_by_title)

print(f"Created {len(deduplicated_objects)} unique objects")
print(f"First 10 objects: {deduplicated_objects[:10]}")

Created 48 unique objects
First 10 objects: [Product(title='Hammer', description='A versatile claw hammer for general carpentry and home repair. Features an ergonomic grip and balanced weight for efficient and comfortable use.'), Product(title='Cordless Drill', description='A powerful cordless drill with a rechargeable lithium-ion battery. Includes multiple drill bits and adjustable speed settings for various tasks.'), Product(title='Adjustable Wrench', description='A durable adjustable wrench with a non-slip handle. Perfect for tightening or loosening nuts and bolts of different sizes.'), Product(title='Screwdriver Set', description='A comprehensive screwdriver set that includes flathead, Phillips, and Torx drivers. Made from high-quality steel for long-lasting performance.'), Product(title='Tape Measure', description='A 25-foot tape measure with a lockable blade and easy-to-read markings. Ideal for accurate measurements in construction and DIY projects.'), Product(title='Circular Saw

We have created the list of objects. Now we will create the product reviews.

In [2]:
# Asynchronous OpenAI client wrapped by instructor so that chat completions
# accept a `response_model`; used by make_reviews to request reviews
# concurrently.
async_client = instructor.from_openai(AsyncOpenAI())


# One generated review; `List[Review]` is the structured response type
# requested from the model in make_reviews.
class Review(BaseModel):
    review: str  # full text of the review


# Flattened record pairing one review with the product it was written for.
class AllObjectInfo(BaseModel):
    title: str  # title of the reviewed product
    description: str  # description of the reviewed product
    review: str  # text of a single review


async def make_reviews(
    product: Product, n: int, semaphore: Optional[asyncio.Semaphore] = None
) -> List[AllObjectInfo]:
    """Generate ``n`` synthetic reviews for a single product.

    Args:
        product: The product to write reviews about; its title and
            description are interpolated into the prompt.
        n: Number of reviews to request from the model.
        semaphore: Semaphore held while the request is in flight. Pass one
            shared semaphore to every call to cap overall concurrency. When
            omitted, a private semaphore is created for this call, so no
            limit is shared with other calls.

    Returns:
        One ``AllObjectInfo`` per review returned by the model, or an empty
        list when the request or the response parsing fails (the error is
        printed, not raised).
    """
    # Create the fallback at call time rather than as a default argument: a
    # default would be built once at definition time and shared by all calls.
    if semaphore is None:
        semaphore = asyncio.Semaphore(1)

    async with semaphore:
        prompt = f"""
        Write {n} realistic but detailed/specific product reviews that might show up on a hardware store's website.

        The reviews should be about the following product:
        Product Title: {product.title}
        Product Description: {product.description}
        
        Add many relevant and concrete facts about the products (this is for synthetic data generation, make up facts about each product as necessary).

        To see the format of a possible review, here is a review for a saw:
        ```
        I've enjoyed using this saw. It is lightweight and the battery lasts longer than other brands.
        I've been using it for 3 years now and it has been very durable. It was twice as expensive as the PX-500. But
        it is comfortable to hold because of the light weight.
        ```

        Respond only with the reviews, and nothing else.
        """

        try:
            result = await async_client.chat.completions.create(
                model="gpt-4o",
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
            )
        except Exception as e:
            # Best effort: one failed product should not abort the whole run.
            print(f"Error generating reviews for {product.title!r}: {e}")
            return []

        # Repeat the product fields on every review so each record is
        # self-contained.
        return [
            AllObjectInfo(
                title=product.title,
                description=product.description,
                review=r.review,
            )
            for r in result
        ]


async def create_synthetic_reviews(
    max_concurrency: int = 20, reviews_per_product: int = reviews_per_product
) -> List[AllObjectInfo]:
    """Generate reviews for every product in ``deduplicated_objects``.

    Args:
        max_concurrency: Maximum number of review requests in flight at once.
        reviews_per_product: Number of reviews to request per product.

    Returns:
        A flat list with the reviews of all products whose generation
        succeeded. Products whose task failed are reported and skipped.

    Raises:
        ValueError: If ``max_concurrency`` is less than 1.
    """
    # A semaphore with an initial value of 0 could never be acquired, so
    # every task would wait forever; fail fast instead.
    if max_concurrency < 1:
        raise ValueError(f"max_concurrency must be at least 1, got {max_concurrency}")

    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [
        make_reviews(o, reviews_per_product, semaphore) for o in deduplicated_objects
    ]
    # return_exceptions=True keeps one failing task from discarding the
    # results of all the others.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    out: List[AllObjectInfo] = []
    for product, result in zip(deduplicated_objects, results):
        # Check BaseException, not Exception, so that results such as
        # asyncio.CancelledError are also skipped instead of reaching extend().
        if isinstance(result, BaseException):
            print(f"Skipping reviews for {product.title!r}: {result!r}")
            continue
        out.extend(result)
    return out


# Top-level `await` relies on the notebook's already-running event loop; in a
# plain script this would need asyncio.run(create_synthetic_reviews()) instead.
reviews = await create_synthetic_reviews()

Next, we store the items to be retrieved (products and reviews) in LanceDB.

In [3]:
# Connect to a local LanceDB database located at ./lancedb.
db = lancedb.connect("./lancedb")
# OpenAI embedding function ("text-embedding-3-small") obtained from LanceDB's
# registry; the table schemas use it to declare source and vector fields.
func = get_registry().get("openai").create(name="text-embedding-3-small")


# LanceDB schema for the "products" table.
# NOTE(review): id, title and description are all declared as SourceField for
# the same embedding function. Confirm which of them LanceDB actually embeds
# into `vector`, and whether embedding `id` is intended.
class Products(LanceModel):
    id: str = func.SourceField()  # stringified position of the product in the list
    title: str = func.SourceField()
    description: str = func.SourceField()
    # Embedding column; its dimensionality is taken from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the products table, replacing any table left over from a previous run.
products_table = db.create_table("products", schema=Products, mode="overwrite")

# One row per de-duplicated product; the row id is the product's position.
products_data = [
    dict(id=f"{idx}", title=product.title, description=product.description)
    for idx, product in enumerate(deduplicated_objects)
]
products_table.add(products_data)

# Full-text search index over the product descriptions.
products_table.create_fts_index("description", replace=True)

# Lookup from product title to stored product id, read back from the table.
product_id_map = dict(
    (row["title"], row["id"])
    for row in products_table.to_pandas().to_dict("records")
)


# LanceDB schema for the "reviews" table.
# NOTE(review): id, product_id and review are all declared as SourceField for
# the same embedding function. Confirm which of them LanceDB actually embeds
# into `vector`, and whether embedding the id fields is intended.
class Reviews(LanceModel):
    id: str = func.SourceField()  # stringified position of the review in the list
    product_id: str = func.SourceField()  # id of the reviewed product in the products table
    review: str = func.SourceField()  # review text
    # Embedding column; its dimensionality is taken from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the reviews table, replacing any table left over from a previous run.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

# Attach to every review the id of its product, looked up by product title.
reviews_with_product_id = [
    dict(id=f"{idx}", product_id=product_id_map[item.title], review=item.review)
    for idx, item in enumerate(reviews)
]
reviews_table.add(reviews_with_product_id)

# Full-text search index over the review text.
reviews_table.create_fts_index("review", replace=True)

To keep some of the other examples simple, we will also store the data as JSON.

In [4]:
# Persist the flattened (title, description, review) records as a JSON array.
# `model_dump()` is the Pydantic v2 replacement for the deprecated `.dict()`.
with open("./reviews.json", "w", encoding="utf-8") as f:
    json.dump([record.model_dump() for record in reviews], f)