In [2]:
import logging
import os
from datetime import datetime
from pathlib import Path

import openai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from rich import print
from tqdm import tqdm
from openai import OpenAI

# Configure logging, then load `.env` BEFORE building the client. The client
# needs an API key at construction time, so instantiating it ahead of
# `load_dotenv()` fails whenever the key only lives in the `.env` file.
logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Reference spreadsheet; the cell ends with its column index so the available
# fields are displayed as the cell output.
df = pd.read_excel("Wedding Values.xlsx")
df.columns

Index(['City', 'Zip Code ', 'State', 'Country ', 'Email', 'Phone Number',
       'Price', 'Price Breakdown', 'Menu Breakdown', 'Bar Breakdown',
       'Groom and Bridal Set-Up', 'Ceremony Cost ', 'Guest Capacity ',
       'Outside Food', 'Outside Alcohol', 'Outside Dessert ',
       'Outside Wedding Coordinator', 'Outside Photographer ',
       'Package Approach', 'Pricing Transparency ', 'Reception or Ceremony',
       'Style', 'Indoor/Outdoor', 'Deposit and Payment Plans ', 'Privacy',
       'Accommodations ', 'Photography Score ', 'Environmental ',
       'What Time Does the Party Need to Stop', 'Late Night Eats ',
       'General Vibe', 'Top Choices ', 'Menu Choices '],
      dtype='object')

In [2]:
from wedding_venue_models import (
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
)

In [3]:
from pathlib import Path

# NOTE(review): absolute, machine-specific path. The batch cells in this
# notebook read from the relative `test_md` directory instead; confirm whether
# this should be `Path("test_md") / "Almansor Court.md"`.
path = Path(
    "/Users/mac-robertsocolewicz/Documents/private/playground_tables/test_md/Almansor Court.md"
)
# The venue name is the file name without its extension.
venue_name = path.stem

# Read with an explicit encoding, matching the batch cells.
doc = path.read_text(encoding="utf-8")

system_prompt_template = """
You are a helpful wedding AI assistant. Guide the user through understanding various options and pricing for the following wedding venue:

===
venue name: {venue_name}

venue description:

{doc}

===
"""
# Fill in the placeholders. Without this step the model receives the literal
# text "{venue_name}" / "{doc}" and never sees the venue description.
system_prompt = system_prompt_template.format(venue_name=venue_name, doc=doc)

user_prompt = """
Please provide all the contact info for this wedding venue.
"""

# Structured-output request: the reply is parsed into WeddingContactInfo.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": system_prompt,
        },
        {"role": "user", "content": user_prompt},
    ],
    response_format=WeddingContactInfo,
)

# math_reasoning = completion.choices[0].message

In [None]:
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Render one ``- <field>: <description>`` bullet per field of a model.

    Parameters
    ----------
    model_class : type[BaseModel]
        Model whose ``model_fields`` mapping is walked in definition order.

    Returns
    -------
    str
        The bullets joined by newlines. A field whose ``description`` is
        missing or empty gets the placeholder "No description provided.".
        An empty string is returned for a model without fields.
    """
    return "\n".join(
        f"- {name}: {info.description or 'No description provided.'}"
        for name, info in model_class.model_fields.items()
    )


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Build the extraction system prompt for one model class.

    The prompt is a fixed preamble (role, general rules for booleans, strings
    and numbers) followed by the per-field bullets produced by
    ``generate_field_instructions`` and a trailing newline.
    """
    prompt_lines = [
        "You are an expert in wedding planning. You are extracting structured information about wedding venues.",
        "",
        "First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.",
        "",
        "Important instructions:",
        "1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.",
        '2. For boolean fields, return true/false values rather than "Yes"/"No" strings.',
        "3. For string fields, provide detailed information or null if not available.",
        "4. For numerical fields, use -1 if information is not available.",
        "5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.",
        "",
        "Field-specific instructions:",
        generate_field_instructions(model_class),
        # The empty last element yields the prompt's final newline.
        "",
    ]
    return "\n".join(prompt_lines)


# Setup logging and OpenAI client
logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# List of all Pydantic models to process
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    # Raise rather than call exit(): in a notebook exit() ends the kernel
    # session, and the later summary cell already reports this condition with
    # FileNotFoundError.
    raise FileNotFoundError(f"{test_md_path} directory not found")

# glob() yields files in no guaranteed order; sorting keeps the five-file
# sample below identical from run to run.
md_files = sorted(test_md_path.glob("*.md"))
ai_model = "gpt-4.5-preview"

for file in tqdm(md_files[0:5], desc="Processing venues", unit="file"):
    tqdm.write(f"Processing: {file.name}")
    with open(file, "r", encoding="utf-8") as f:
        md_content = f.read()

    venue_name = file.stem
    venue_dict = {"name": venue_name}

    # Process each model for this venue
    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        try:
            completion = client.beta.chat.completions.parse(
                model=ai_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": f"Extract venue information from this text about '{venue_name}':\n\n{md_content}",
                    },
                ],
                response_format=model_class,
                temperature=0,
            )
            venue_info = completion.choices[0].message.parsed.model_dump()
            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info.items()
            }
            venue_dict.update(prefixed_venue_info)
            tqdm.write(
                f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
            )
        except Exception as e:
            # Best effort: a failed model is logged and its columns are simply
            # absent for this venue; the remaining models still run.
            tqdm.write(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )

    venue_data.append(venue_dict)

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    # Record the model name before printing so the displayed frame matches the
    # saved CSV.
    df["model"] = ai_model
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

In [60]:
from typing import Optional, Literal

# The three normalized package tier labels; used as the type of the `name`
# field on both MenuTier and BarTier.
TierName = Literal["Standard", "Signature", "Premium"]


# Contact and location details extracted for one venue. Every field is required
# but nullable.
#
# Field guidance is passed through Field(description=...): a bare string
# literal placed after an attribute is not read by pydantic by default, which
# leaves `model_fields[name].description` as None and makes the prompt builder
# in this notebook emit "No description provided." for the field.
#
# No class docstring is added on purpose: the prompt builder interpolates
# `model_class.__doc__` into the system prompt.
class WeddingContactInfo(BaseModel):
    city: str | None = Field(
        description="The city this wedding venue is located in."
    )
    state: str | None = Field(
        description="The state this wedding venue is located in."
    )
    zip_code: str | None = Field(
        description="The zip code this wedding venue is located in."
    )
    country: str | None = Field(
        description="The country this wedding venue is located in."
    )
    email: str | None = Field(
        description="The email address of the wedding venue."
    )
    phone: str | None = Field(
        description="The phone number of the wedding venue."
    )
    website: str | None = Field(
        description="The website of the wedding venue."
    )


# One food package. The class docstring below is model-facing guidance, not
# maintainer documentation, so it describes a SINGLE tier (the "exactly three
# tiers" requirement belongs to the container model, FoodBreakdown).
#
# Field guidance uses Field(description=...) because bare string literals after
# an attribute are not read by pydantic by default, which leaves
# `model_fields[name].description` as None.
class MenuTier(BaseModel):
    """
    A single food tier, normalized to Standard, Signature, or Premium.

    Venues often name their packages arbitrarily (e.g., "Adore", "Silver",
    "Classic", "Treasure"). Your task is to interpret these names and map each one
    to a standard tier by evaluating the **menu quality, service level, and
    pricing**.

    Use the following general guidance:

    - **Standard**: basic offerings, lower price point
    - **Signature**: moderate enhancements, mid-range price
    - **Premium**: multiple high-end appetizers, upgraded entrées (beef,
        seafood), top-tier service, highest price

    Ignore the original package names. Focus only on the **content and price range**
    to assign the correct standard tier name.
    """

    name: TierName = Field(
        description="Tier name: must be Standard, Signature, or Premium."
    )
    appetizers: str | None = Field(description="Appetizers in this tier.")
    entrees: str | None = Field(description="Entrées in this tier.")
    sides: str | None = Field(description="Side dishes in this tier.")
    desserts: str | None = Field(description="Desserts in this tier.")
    beverages: str | None = Field(description="Beverages in this tier.")
    price_range: list[int] | None = Field(
        description=(
            "Price range (e.g., [60,70]) in USD per person. Estimate for a "
            "group of 100 guests."
        )
    )

    def to_string(self) -> str:
        """Return a multi-line text summary of this tier.

        The first line is the tier name, followed by one ``- Label: value``
        line per populated course and a price line. A one-element price range
        prints a single price; a longer one prints its first two values as
        ``$low-$high``. Returns an empty string when nothing but the name is
        populated.
        """
        courses = [
            ("Appetizers", self.appetizers),
            ("Entrées", self.entrees),
            ("Sides", self.sides),
            ("Desserts", self.desserts),
            ("Beverages", self.beverages),
        ]
        details = [f"- {label}: {value}" for label, value in courses if value]

        if self.price_range:
            if len(self.price_range) == 1:
                details.append(f"- Price: ${self.price_range[0]}")
            else:
                details.append(
                    f"- Price: ${self.price_range[0]}-${self.price_range[1]}"
                )

        # A tier with only a name carries no information worth printing.
        if not details:
            return ""
        return "\n".join([f"{self.name}", *details])


# Container for a venue's food packages. The class docstring is model-facing
# guidance: the prompt builder in this notebook interpolates
# `model_class.__doc__` into the system prompt, so its wording is kept as is.
#
# The `tiers` guidance uses Field(description=...) because a bare string literal
# after an attribute is not read by pydantic by default, which leaves
# `model_fields["tiers"].description` as None.
class FoodBreakdown(BaseModel):
    """
    You must return exactly three food tiers: Standard, Signature, and
    Premium.

    Venues often name their packages arbitrarily (e.g., "Adore", "Silver",
    "Classic", "Treasure"). Your task is to interpret these names and map each one
    to a standard tier by evaluating the **menu quality, service level, and
    pricing**.

    Use the following general guidance:

    - **Standard**: basic offerings, lower price point
    - **Signature**: moderate enhancements, mid-range price
    - **Premium**: multiple high-end appetizers, upgraded entrées (beef,
        seafood), top-tier service, highest price

    Ignore the original package names. Focus only on the **content and price range**
    to assign the correct standard tier name.
    """

    tiers: list[MenuTier] = Field(
        description=(
            "List of exactly 3 tiers: Standard (Silver), Signature (Gold), "
            "Premium (Platinum)."
        )
    )

    def to_string(self) -> str:
        """Return the tier summaries separated by blank lines.

        Tiers whose own summary is empty (nothing but a name was extracted)
        are skipped so the result has no dangling blank lines.
        """
        summaries = (tier.to_string() for tier in self.tiers)
        return "\n\n".join(summary for summary in summaries if summary)


# One drink package.
#
# Field guidance uses Field(description=...) because bare string literals after
# an attribute are not read by pydantic by default, which leaves
# `model_fields[name].description` as None. No class docstring is added on
# purpose, so the text sent to the model only gains the field descriptions.
class BarTier(BaseModel):
    name: TierName = Field(
        description="Tier name: must be Standard, Signature, or Premium."
    )
    highlights: str | None = Field(
        description="Beverages and package highlights in this drink package."
    )
    price_range: list[int] | None = Field(
        description=(
            "Price range (e.g., [60,70]) in USD per person. Estimate for a "
            "group of 100 guests."
        )
    )
    bar_pricing_model: Literal[
        "Open bar", "Hosted bar", "Cash bar", "Not Offered"
    ] = Field(
        description=(
            "- open bar: the venue provides the bar and the drinks, prepaid "
            "for by the host\n"
            "- hosted bar: the venue provides the bar and the drinks, paid "
            "for by the host at the end of the night, also referred to as a "
            "consumption bar or tab bar\n"
            "- cash bar: the venue provides the bar, but the drinks are paid "
            "for by the guests\n"
            "- not offered: the venue does not offer a bar or is not specified"
        )
    )

    def to_string(self) -> str:
        """Return a multi-line text summary of this drink package.

        The first line is the tier name, followed by the highlights and the
        price when present. Price formatting matches ``MenuTier.to_string``:
        one value prints a single price, anything longer prints the first two
        values as ``$low-$high``. Returns an empty string when nothing but the
        name is populated. ``bar_pricing_model`` is not part of the summary.
        """
        details = []
        if self.highlights:
            details.append(f"- Highlights: {self.highlights}")
        if self.price_range:
            if len(self.price_range) == 1:
                details.append(f"- Price: ${self.price_range[0]}")
            else:
                details.append(
                    f"- Price: ${self.price_range[0]}-${self.price_range[1]}"
                )

        if not details:
            return ""
        return "\n".join([f"{self.name}", *details])


# Container for a venue's drink packages. The class docstring is model-facing
# guidance: the prompt builder in this notebook interpolates
# `model_class.__doc__` into the system prompt, so its wording is kept as is.
#
# Field guidance uses Field(description=...) because bare string literals after
# an attribute are not read by pydantic by default, which leaves
# `model_fields[name].description` as None.
class BarBreakdown(BaseModel):
    """
    You must return exactly three bar tiers: Standard, Signature, and
    Premium.

    Many venues use unique names for their drink packages. Instead of
    copying those names, evaluate the **included alcohol types and
    service level**, and map them to a standard tier:

    - **Standard**: basic offerings, lower price point
    - **Signature**: mid-range price, includes house liquors or a soft
        bar with wine service
    - **Premium**: top-tier service, highest price like top-shelf
        liquor, signature cocktails, champagne toast, or full open bar

    Always normalize to these three tiers based on the drink offerings —
    not the label.
    """

    tiers: list[BarTier] = Field(
        description=(
            "List of exactly 3 tiers: Standard (Silver), Signature (Gold), "
            "Premium (Platinum)."
        )
    )

    extras: Optional[str] = Field(
        description="Extras like mocktails, signature cocktails, or batched drinks."
    )
    # The option order matters to WeddingVenue.add_bar_flexibility, which turns
    # the position of the chosen option into a numeric score.
    flexibility: Literal[
        "Completely fixed packages, no flexibility",
        "Fixed packages with a few extras or options",
        "Moderate or flexible approach",
        "Highly customizable with some structure",
        "Completely custom/DIY",
    ] = Field(
        description="How much freedom does the customer have to customize the package?"
    )

    def to_string(self) -> str:
        """Return the tier summaries and extras, one block per line group.

        Tiers whose own summary is empty are skipped so the result contains no
        blank filler lines. ``flexibility`` is not part of the summary.
        """
        parts = [
            summary for summary in (tier.to_string() for tier in self.tiers) if summary
        ]
        if self.extras:
            parts.append(f"Extras: {self.extras}")

        return "\n".join(parts)


# One pricing option for a venue.
#
# All field guidance goes through Field(description=...): bare string literals
# after an attribute are not read by pydantic by default, which leaves
# `model_fields[name].description` as None. The option lists inside the
# descriptions repeat the Literal values exactly, so the instructions and the
# allowed values agree.
class WeddingPriceInfo(BaseModel):
    price: int | None = Field(
        description="Give me your best guess for the cost per person it would be to have a wedding reception at this venue. Respond with only a number. Treat the costs associated with the ceremony as separate. If there are unknowns, use your best judgment to guesstimate the cost given the geographic location and relative 'niceness' of the property. If there are multiple packages and options, choose the middle or medium option to provide the best reference point. When you give the number,make sure you account for local taxes, tip, and any service charges. Assume that couples will pay for standard things like dinner, an appetizer course, dessert, alcohol, photography but exclude non-standard things like a late-night meal."
    )
    option: Literal["standard", "premium", "signature"] = Field(
        description="This is the pricing option for this wedding venue."
    )
    ceremony_cost: int | None = Field(
        description="Estimate a cost for the ceremony. Respond with a number only which would be your best guess based on the available information. Use -1 if no information is provided."
    )
    pricing_transparency: (
        Literal[
            "This venue discloses a small portion of the total wedding costs",
            "This venue discloses a moderate portion of the total wedding costs",
            "This venue discloses a high degree of the total costs",
            "Not enough information",
        ]
        | None
    ) = Field(
        description=(
            "Assess how much of the total wedding cost is disclosed in the "
            "provided materials. You MUST CHOOSE ONE of the following options "
            "that best matches the document's content and return the selected "
            "option's description as the field value:\n"
            "- This venue discloses a small portion of the total wedding costs\n"
            "- This venue discloses a moderate portion of the total wedding costs\n"
            "- This venue discloses a high degree of the total costs\n"
            "- Not enough information\n"
            "Guidance when selecting the option: A 'small portion' of "
            "disclosure means significant costs (e.g., food, bar/alcohol) are "
            "unclear or require contacting external vendors. A 'moderate "
            "portion' means some unknowns exist, but you can get a general "
            "cost idea without much extra work. A 'high degree' means most "
            "costs are disclosed with few surprises, little additional work "
            "needed to understand the total cost."
        )
    )

    deposit_and_payment_plans: (
        Literal[
            "The venue works with me on deposit terms and payment plans",
            "The venue does not have flexibility on deposit terms and payment plans",
            "Not enough information",
        ]
        | None
    ) = Field(
        description=(
            "Determine if the venue offers flexibility on deposit terms and "
            "payment plans. You MUST CHOOSE ONE of the following options that "
            "best matches the document's content and return the selected "
            "option's description as the field value:\n"
            "- The venue works with me on deposit terms and payment plans\n"
            "- The venue does not have flexibility on deposit terms and payment plans\n"
            "- Not enough information\n"
            "Follow this guidance when selecting the option: Flexibility "
            "means the venue allows negotiation on deposit amounts, payment "
            "schedules, or offers installment plans. Lack of flexibility is "
            "indicated by strict terms or no mention of flexible options."
        )
    )

    def to_string(self) -> str:
        """Return ``Price: $<price>`` and ``Option: <option>`` on separate lines.

        The price line is omitted when ``price`` is None or 0. The other
        fields are not part of the summary.
        """
        parts = []
        if self.price:
            parts.append(f"Price: ${self.price}")
        if self.option:
            parts.append(f"Option: {self.option}")
        return "\n".join(parts)


# All pricing options for one venue plus venue-level pricing assessments.
#
# All field guidance goes through Field(description=...): bare string literals
# after an attribute are not read by pydantic by default, which leaves
# `model_fields[name].description` as None. The option lists inside the
# descriptions repeat the Literal values exactly, so the instructions and the
# allowed values agree.
class WeddingVenuePricingSummary(BaseModel):
    options: list[WeddingPriceInfo] = Field(
        description="This is a list of all the pricing options for this wedding venue."
    )
    pricing_transparency: (
        Literal[
            "This venue discloses a small portion of the total wedding costs",
            "This venue discloses a moderate portion of the total wedding costs",
            "This venue discloses a high degree of the total costs",
            "Not enough information",
        ]
        | None
    ) = Field(
        description=(
            "Assess how much of the total wedding cost is disclosed in the "
            "provided materials. You MUST CHOOSE ONE of the following options "
            "that best matches the document's content and return the selected "
            "option's description as the field value:\n\n"
            "- This venue discloses a small portion of the total wedding costs\n"
            "- This venue discloses a moderate portion of the total wedding costs\n"
            "- This venue discloses a high degree of the total costs\n"
            "- Not enough information\n\n"
            "Guidance when selecting the option: A 'small portion' of "
            "disclosure means significant costs (e.g., food, bar/alcohol) are "
            "unclear or require contacting external vendors. A 'moderate "
            "portion' means some unknowns exist, but you can get a general "
            "cost idea without much extra work. A 'high degree' means most "
            "costs are disclosed with few surprises, little additional work "
            "needed to understand the total cost."
        )
    )

    deposit_and_payment_plans: (
        Literal[
            "The venue works with me on deposit terms and payment plans",
            "The venue does not have flexibility on deposit terms and payment plans",
            "Not enough information",
        ]
        | None
    ) = Field(
        description=(
            "Determine if the venue offers flexibility on deposit terms and "
            "payment plans. You MUST CHOOSE ONE of the following options that "
            "best matches the document's content and return the selected "
            "option's description as the field value:\n\n"
            "- The venue works with me on deposit terms and payment plans\n"
            "- The venue does not have flexibility on deposit terms and payment plans\n"
            "- Not enough information\n\n"
            "Follow this guidance when selecting the option: Flexibility "
            "means the venue allows negotiation on deposit amounts, payment "
            "schedules, or offers installment plans. Lack of flexibility is "
            "indicated by strict terms or no mention of flexible options."
        )
    )

    def to_string(self) -> str:
        """Return each option's summary, joined by newlines.

        An empty ``options`` list yields an empty string. The venue-level
        assessment fields are not part of the summary.
        """
        return "\n".join(option.to_string() for option in self.options)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Build the extraction system prompt for one model class.

    The prompt is a fixed preamble followed by the model's class docstring
    (used as tier-mapping guidance) and the per-field bullets produced by
    ``generate_field_instructions``.

    Parameters
    ----------
    model_class : type[BaseModel]
        Model whose docstring and field descriptions are embedded.

    Returns
    -------
    str
        The complete system prompt.
    """
    field_instructions = generate_field_instructions(model_class)
    # A class without a docstring has `__doc__` set to None; fall back to an
    # empty string so the literal text "None" never ends up in the prompt.
    tier_hint = model_class.__doc__ or ""

    return f"""
        You are an expert in wedding planning. You are extracting structured
        information about wedding venues.

        First, carefully analyze all relevant information in the text. Consider
        both explicit statements and reasonable inferences.

        Important instructions: 
        1. For each field, follow the specific guidelines below about how to handle 
           ambiguous or missing information.
        2. For boolean fields, return true/false values rather than "Yes"/"No" strings. 
        3. For string fields, provide detailed information or null if not available. 
        4. For numerical fields, use -1 if information is not available. 
        5. Begin by developing a comprehensive reasoning that considers all evidence 
           before determining individual field values.

        Field-specific instructions:
        {tier_hint}
        {field_instructions}
        """


In [61]:
# Setup: logging, environment and a fresh OpenAI client
logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Models to extract for every venue, in processing order
models = [
    FoodBreakdown,
    BarBreakdown,
    WeddingContactInfo,
    WeddingVenuePricingSummary,
    WeddingVenueStyle,
]

venue_data = []
md_path = Path("test_md")
if not md_path.exists():
    raise FileNotFoundError("Directory 'test_md' not found")

# Only the last file returned by glob() is processed in this run.
md_files = list(md_path.glob("*.md"))[-1:]
ai_model = "gpt-4o-mini"

# Parsed model objects, in the order they were produced
raw = []
for file in tqdm(md_files, desc="Processing venues", unit="file"):
    tqdm.write(f"Processing: {file.name}")
    md_content = file.read_text(encoding="utf-8")

    venue_name = file.stem
    venue_dict = {"name": venue_name}

    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        summary_key = f"{model_class.__name__}_summary"
        user_message = {
            "role": "user",
            "content": f"Extract venue information from this text about '{venue_name}':\n\n{md_content}",
        }
        try:
            completion = client.beta.chat.completions.parse(
                model=ai_model,
                messages=[{"role": "system", "content": system_prompt}, user_message],
                response_format=model_class,
                temperature=0,
            )
            obj = completion.choices[0].message.parsed
            raw.append(obj)
            # Prefer the model's own text summary; otherwise store the raw dump.
            venue_dict[summary_key] = (
                obj.to_string() if hasattr(obj, "to_string") else obj.model_dump()
            )
            tqdm.write(f"✓ Processed {model_class.__name__} for: {venue_name}")
        except Exception as e:
            # Best effort: log the failure and leave an explicit empty cell.
            tqdm.write(f"✗ Error with {model_class.__name__} for {venue_name}: {e}")
            venue_dict[summary_key] = None

    venue_data.append(venue_dict)

# Save DataFrame
if not venue_data:
    print("⚠️ No venue data processed.")
else:
    df = pd.DataFrame(venue_data)
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"food_bar_summary_{now}.csv"
    df.to_csv(output_path, index=False)
    print(f"✅ Summary saved to: {output_path}")

Processing venues:   0%|          | 0/1 [00:00<?, ?file/s]

Processing: 94th Aero Squadron Restaurant_.md


Processing venues:   0%|          | 0/1 [00:03<?, ?file/s]

✓ Processed FoodBreakdown for: 94th Aero Squadron Restaurant_


Processing venues:   0%|          | 0/1 [00:08<?, ?file/s]

✓ Processed BarBreakdown for: 94th Aero Squadron Restaurant_


Processing venues:   0%|          | 0/1 [00:10<?, ?file/s]

✓ Processed WeddingContactInfo for: 94th Aero Squadron Restaurant_


Processing venues:   0%|          | 0/1 [00:16<?, ?file/s]

✓ Processed WeddingVenuePricingSummary for: 94th Aero Squadron Restaurant_


Processing venues: 100%|██████████| 1/1 [00:18<00:00, 18.55s/file]

✓ Processed WeddingVenueStyle for: 94th Aero Squadron Restaurant_





In [62]:
# Display the parsed model objects that the extraction loop appended to `raw`.
raw

[FoodBreakdown(tiers=[MenuTier(name='Standard', appetizers='French Onion, Seafood Chowder, Tomato Basil Soup, Mushroom-Brie Bisque, 94th Caesar Salad, Mixed Garden Greens, Italian Pasta Salad, Red Potato Salad with Dill', entrees='Roasted Pork Loin, Oven Roasted Peppered Chicken, Tortellini Cassoulet, Chicken Caprese, Braised Beef Short Ribs, Mahi-Mahi with Lemon Caper, Lobster Spaghetti, New York Strip Steak, Herb Crusted Seabass, Crab Stuffed Jumbo Shell Pasta', sides='Red Bliss, Roasted Garlic Mashed Potatoes, Fresh Medley of Vegetables, Pesto Grains, Confit Potatoes', desserts=None, beverages=None, price_range=[70, 83]), MenuTier(name='Signature', appetizers=None, entrees=None, sides=None, desserts=None, beverages=None, price_range=None), MenuTier(name='Premium', appetizers=None, entrees=None, sides=None, desserts=None, beverages=None, price_range=None)]),
 BarBreakdown(tiers=[BarTier(name='Standard', highlights='Basic offerings with a focus on traditional buffet menu options.', pr

In [43]:
# Print the text summary of the first object collected in `raw`.
print(raw[0].to_string())

In [96]:
from typing import get_args

# Unpack the Literal annotation of `flexibility` to list its allowed values in
# declaration order.
get_args(BarBreakdown.model_fields["flexibility"].annotation)

('Completely fixed packages, no flexibility',
 'Fixed packages with a few extras or options',
 'Moderate or flexible approach',
 'Highly customizable with some structure',
 'Completely custom/DIY')

In [None]:
"""
missing:
- ceremony cost
- outside wedding coordinator
- outside photographer
- package flexibility
- reception or ceremony
- what time does the party need to stop?
- top choices
- guest capacity
- late night food/snack


improve 
"""

In [97]:
from openpyxl.styles import Font, PatternFill

# Maps flattened `<ModelName>_<field>` column names to human-readable headers.
# Insertion order matters: WeddingVenue.rename_columns reindexes the frame in
# the order of this dict's values (minus "wedding venue", which becomes the
# index).
readable_columns = {
    "venue": "wedding venue",
    "WeddingContactInfo_city": "city",
    "WeddingContactInfo_state": "state",
    "WeddingContactInfo_country": "country",
    "WeddingContactInfo_zip_code": "zip code",
    "WeddingContactInfo_email": "email",
    "WeddingContactInfo_website": "website",
    "WeddingContactInfo_phone": "phone",
    "WeddingVenuePricingSummary_summary": "venue pricing summary",
    "FoodBreakdown_summary": "food breakdown",
    "BarBreakdown_summary": "bar breakdown",
    "BarBreakdown_extras": "bar extras",
    "BarBreakdown_flexibility": "bar flexibility",
    "WeddingVenuePricingSummary_pricing_transparency": "pricing transparency",
    "WeddingVenuePricingSummary_deposit_and_payment_plans": "deposit and payment plans",
    "WeddingVenueStyle_style": "style",
    "WeddingVenueStyle_indoor_outdoor": "indoor/outdoor",
    "WeddingVenueStyle_privacy": "privacy",
    "WeddingVenueStyle_accommodations": "accommodations",
    "WeddingVenueStyle_environmental": "environmental",
    "WeddingVenueStyle_general_vibe": "general vibe",
}


def flatten_dict(d: dict, parent_key: str = "", sep: str = "_") -> dict:
    """Collapse nested dictionaries into a single level.

    Keys of nested dictionaries are joined to their parent's key with ``sep``.
    Values that are not dictionaries (lists included) are copied unchanged.

    Parameters
    ----------
    d : dict
        Dictionary to flatten.
    parent_key : str, optional
        Prefix applied to every key of ``d``; empty by default.
    sep : str, optional
        Separator placed between key segments, ``"_"`` by default.

    Returns
    -------
    dict
        Flat dictionary in depth-first key order. A nested dictionary that is
        empty contributes no entries.

    Examples
    --------
    >>> flatten_dict({"a": 1, "b": {"c": 2, "d": {"e": 3}}})
    {'a': 1, 'b_c': 2, 'b_d_e': 3}
    """
    flat: dict = {}
    for key, value in d.items():
        # Only prefix when there is a parent; top-level keys stay untouched.
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep))
        else:
            flat[full_key] = value
    return flat


class WeddingVenue:
    """One venue's extraction results as a single-row DataFrame.

    The parsed model objects are dumped to dictionaries, flattened into
    ``<ModelName>_<field>`` columns, renamed through ``readable_columns`` and
    indexed by venue name. The frame is available as ``self.df``.
    """

    def __init__(self, venue_name: str, raw: list[BaseModel]):
        """Build the frame from a venue name and its parsed model objects.

        Parameters
        ----------
        venue_name : str
            Value of the "wedding venue" index.
        raw : list[BaseModel]
            Parsed objects; each contributes columns prefixed with its class
            name. Objects with a ``tiers`` or ``options`` list have that list
            replaced by a ``summary`` string from ``to_string()``.
        """
        item_dict = {"venue": venue_name}
        for item in raw:
            obj_dict = item.model_dump()
            # Nested lists cannot live in a one-row frame, so they are swapped
            # for the object's text summary.
            for nested_key in ("tiers", "options"):
                if nested_key in obj_dict:
                    obj_dict.pop(nested_key)
                    obj_dict["summary"] = item.to_string()

            item_dict[item.__class__.__name__] = obj_dict

        self.df = pd.DataFrame()
        self.update(item_dict)

    def add_bar_flexibility(self) -> None:
        """Turn "bar flexibility" into a numeric score, keeping the text.

        The original label moves to "bar flexibility info". The score is the
        number of allowed options minus the label's position, so the first
        option scores highest (5 with the current five options). A value that
        is not one of the allowed options (for example a missing value when
        the bar model was not extracted) becomes a missing score instead of
        raising.
        """
        self.df["bar flexibility info"] = self.df["bar flexibility"]
        args = get_args(BarBreakdown.model_fields["flexibility"].annotation)
        self.df["bar flexibility"] = self.df["bar flexibility"].map(
            lambda x: len(args) - args.index(x) if x in args else None
        )

    def update(self, d: dict) -> None:
        """Rebuild ``self.df`` from a nested dictionary of results."""
        self.df = pd.DataFrame(flatten_dict(d), index=[0])
        self.rename_columns()
        self.add_bar_flexibility()

    def _repr_html_(self) -> str:
        """Render as the underlying DataFrame in notebook output."""
        return self.df._repr_html_()

    def rename_columns(self) -> None:
        """Rename and reorder columns based on the readable_columns dictionary.

        "wedding venue" becomes the index; the remaining columns follow the
        order of ``readable_columns`` values. Columns without a mapping are
        dropped and mapped columns that are absent are added empty.
        """
        ordered_columns = [
            col for col in readable_columns.values() if col != "wedding venue"
        ]
        self.df = (
            self.df.rename(columns=readable_columns)
            .set_index("wedding venue")
            .reindex(columns=ordered_columns)
        )

    def to_excel(self, name: str = "wedding_venue.xlsx"):
        """Write the frame to a styled Excel sheet named "Venue Options".

        Parameters
        ----------
        name : str, optional
            Target file name; ".xlsx" is appended when missing. If the file
            already exists, a timestamp is inserted before the extension so
            nothing is overwritten.

        Returns
        -------
        WeddingVenue
            ``self``, so the call can end a notebook cell and display the frame.
        """
        if not name.endswith(".xlsx"):
            name = f"{name}.xlsx"
        if os.path.exists(name):
            # Insert the timestamp before the extension rather than after it.
            stem, ext = os.path.splitext(name)
            name = f"{stem}_{datetime.now().strftime('%Y%m%d_%H%M%S')}{ext}"

        # Move the venue name out of the index so it is written as a regular
        # first column instead of being dropped by index=False.
        export_df = self.df.reset_index()

        with pd.ExcelWriter(name, engine="openpyxl") as writer:
            export_df.to_excel(writer, sheet_name="Venue Options", index=False)

            worksheet = writer.sheets["Venue Options"]

            header_fill = PatternFill(
                start_color="B3E5FC", end_color="B3E5FC", fill_type="solid"
            )
            header_font = Font(bold=True)

            # Style exactly the header cells of the frame being written.
            for col_idx in range(1, len(export_df.columns) + 1):
                cell = worksheet.cell(row=1, column=col_idx)
                cell.fill = header_fill
                cell.font = header_font

            # Fit each column to its longest value, capped at 50 characters.
            for column_cells in worksheet.columns:
                max_length = 0
                column_letter = column_cells[0].column_letter
                for cell in column_cells:
                    if cell.value:
                        max_length = max(max_length, len(str(cell.value)))
                adjusted_width = max_length + 2
                worksheet.column_dimensions[column_letter].width = min(
                    adjusted_width, 50
                )

            worksheet.auto_filter.ref = worksheet.dimensions

        # The workbook is saved when the `with` block exits; nothing may be
        # written to the writer after this point.
        return self


In [98]:
# Build the one-row venue table from `venue_name` and `raw`, then write it to
# the default Excel file. `to_excel()` returns the WeddingVenue itself, so the
# cell output is the table rendered through `_repr_html_`.
venue = WeddingVenue(venue_name, raw)
venue.to_excel()

Unnamed: 0_level_0,food breakdown,bar extras,bar flexibility,bar breakdown,city,state,zip code,country,email,phone,...,pricing transparency,deposit and payment plans,venue pricing summary,style,indoor/outdoor,privacy,accommodations,environmental,general vibe,bar flexibility info
wedding venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94th Aero Squadron Restaurant_,"Standard\n- Appetizers: French Onion, Seafood ...",Custom menus may be designed upon request.,3,Standard\n- Highlights: Basic offerings with a...,San Diego,CA,92123,USA,info@94thsandiego.com,(858) 560-6771,...,Not enough information,The venue works with me on deposit terms and p...,Price: $77\nOption: standard\nPrice: $83\nOpti...,Restaurants,Not enough information,Moderate privacy with possible nearby non-wedd...,Not enough information,This venue does not emphasize environment or s...,Warm and cozy,Moderate or flexible approach


In [None]:
# NOTE(review): second definition of `readable_columns` with the same entries
# in a different order. Whichever cell ran last decides the column order used by
# WeddingVenue.rename_columns; the recorded output of the WeddingVenue cell
# matches this ordering. Consider keeping a single definition.
readable_columns = {
    "venue": "wedding venue",
    "FoodBreakdown_summary": "food breakdown",
    "BarBreakdown_extras": "bar extras",
    "BarBreakdown_flexibility": "bar flexibility",
    "BarBreakdown_summary": "bar breakdown",
    "WeddingContactInfo_city": "city",
    "WeddingContactInfo_state": "state",
    "WeddingContactInfo_zip_code": "zip code",
    "WeddingContactInfo_country": "country",
    "WeddingContactInfo_email": "email",
    "WeddingContactInfo_phone": "phone",
    "WeddingContactInfo_website": "website",
    "WeddingVenuePricingSummary_pricing_transparency": "pricing transparency",
    "WeddingVenuePricingSummary_deposit_and_payment_plans": "deposit and payment plans",
    "WeddingVenuePricingSummary_summary": "venue pricing summary",
    "WeddingVenueStyle_style": "style",
    "WeddingVenueStyle_indoor_outdoor": "indoor/outdoor",
    "WeddingVenueStyle_privacy": "privacy",
    "WeddingVenueStyle_accommodations": "accommodations",
    "WeddingVenueStyle_environmental": "environmental",
    "WeddingVenueStyle_general_vibe": "general vibe",
}

In [71]:
# Show the column index of `df`.
# NOTE(review): the recorded output lists flattened `<ModelName>_<field>`
# columns that no visible cell assigns to `df` — presumably produced by a cell
# that is no longer in the notebook; confirm before relying on it.
df.columns

Index(['venue', 'FoodBreakdown_summary', 'BarBreakdown_extras',
       'BarBreakdown_flexibility', 'BarBreakdown_summary',
       'WeddingContactInfo_city', 'WeddingContactInfo_state',
       'WeddingContactInfo_zip_code', 'WeddingContactInfo_country',
       'WeddingContactInfo_email', 'WeddingContactInfo_phone',
       'WeddingContactInfo_website',
       'WeddingVenuePricingSummary_pricing_transparency',
       'WeddingVenuePricingSummary_deposit_and_payment_plans',
       'WeddingVenuePricingSummary_summary', 'WeddingVenueStyle_style',
       'WeddingVenueStyle_indoor_outdoor', 'WeddingVenueStyle_privacy',
       'WeddingVenueStyle_accommodations', 'WeddingVenueStyle_environmental',
       'WeddingVenueStyle_general_vibe'],
      dtype='object')

In [27]:
# Print the food-breakdown summary text of the first row of `df`.
print(df["FoodBreakdown_summary"].iloc[0])

In [None]:
import json
import logging
import os
from datetime import datetime
from pathlib import Path

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Setup detailed logging.
# force=True is required: basicConfig() is a no-op once the root logger already
# has handlers (an earlier cell configures it at WARNING), so without it the
# DEBUG level and the format below would be silently ignored.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

# Load environment variables (GOOGLE_API_KEY is expected in the .env file).
load_dotenv()

# SECURITY: never hardcode the key in the notebook. The key that was previously
# embedded in this cell must be treated as compromised and revoked/rotated.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError(
        "GOOGLE_API_KEY not found in environment variables. Please set it in your .env file."
    )

# Configure the Gemini API
genai.configure(api_key=api_key)


# Define helper functions
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Render one ``- <field>: <description>`` bullet per field of a Pydantic model.

    Args:
        model_class: Model whose ``model_fields`` mapping is walked in order.

    Returns:
        The bullets joined with newlines (an empty string when the model has no
        fields). A field with a missing or empty description is rendered with
        the placeholder "No description provided.".
    """
    described_fields = (
        (name, info.description or "No description provided.")
        for name, info in model_class.model_fields.items()
    )
    bullets = [
        # guest_capacity additionally carries an explicit integer-format hint.
        f"- {name}: {text} (Return as an integer: 1, 50, 100, 150, 200, or 300, or -1 if not available)"
        if name == "guest_capacity"
        else f"- {name}: {text}"
        for name, text in described_fields
    ]
    return "\n".join(bullets)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Build the extraction prompt for one Pydantic model.

    The prompt is fixed guidance text with the per-field bullet list from
    ``generate_field_instructions(model_class)`` placed directly under the
    "Field-specific instructions:" heading, followed by a closing request line.

    Args:
        model_class: Model whose field descriptions are embedded in the prompt.

    Returns:
        The complete prompt text.
    """
    # Fixed part of the prompt; it ends with a newline so the bullets start on
    # the line right after the heading.
    preamble = """You are an expert in wedding planning. You are extracting structured information about wedding venues.

First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.

Important instructions:
1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.
2. For boolean fields, return true/false values rather than "Yes"/"No" strings.
3. For string fields, provide detailed information or null if not available.
4. For numerical fields (e.g., guest_capacity), return integers or -1 if not available.
5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.
6. For fields with predefined choices (e.g., Literal types), you MUST return only the exact values listed in the options. If the information does not match any option, default to 'Not enough information' or the specified default value.
7. Return a valid JSON object with no additional text or commentary.

Field-specific instructions:
"""
    closing = "\n\nNow, extract the following venue information from the provided text:"
    return preamble + generate_field_instructions(model_class) + closing


# List of all Pydantic models to process.
# Each model is sent to Gemini as its own request; the returned fields are
# stored in the venue row under a "<ModelName>_" prefix.
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    # Fail with a real exception instead of exit(): exit() is an
    # interactive-shell helper and, in a notebook, shuts the kernel down.
    raise FileNotFoundError(f"{test_md_path} directory not found")

# Sorted so the slice below selects the same venues on every run; directory
# listing order is not guaranteed.
md_files = sorted(test_md_path.glob("*.md"))
ai_model = "gemini-2.0-flash-exp"  # Adjusted to a valid Gemini model

# The model name never changes, so build the client once instead of once per
# (venue, schema) request.
model = genai.GenerativeModel(model_name=ai_model)

# NOTE(review): only md_files[2:5] is processed - presumably a deliberate
# subset for this run; widen the slice for a full extraction.
for file in tqdm(md_files[2:5], desc="Processing venues", unit="file"):
    logging.info(f"Processing file: {file.name}")
    md_content = file.read_text(encoding="utf-8")

    venue_name = file.stem
    venue_dict = {"name": venue_name}
    logging.debug(f"Venue dict initialized for {venue_name}: {venue_dict}")

    # Process each model for this venue
    for model_class in models:
        # WeddingVenueOther is handled specially throughout: no response_schema
        # on the request, manual guest_capacity coercion, manual validation.
        is_venue_other = model_class is WeddingVenueOther
        # Set to False when parsing or validation fails, so the success
        # message is not logged for a result that was discarded.
        extraction_ok = True

        system_prompt = create_system_prompt(model_class)
        logging.debug(
            f"System prompt for {model_class.__name__}: {system_prompt[:500]}..."
        )  # Limit to first 500 chars
        try:
            logging.debug(f"Generating content with model: {ai_model}")
            response = model.generate_content(
                contents=[
                    {
                        "role": "user",
                        "parts": [
                            f"{system_prompt}\n\nExtract venue information from this text about '{venue_name}':\n\n{md_content}"
                        ],
                    },
                ],
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json",
                    # Skip response_schema for WeddingVenueOther to avoid schema enforcement issues
                    response_schema=None if is_venue_other else model_class,
                    temperature=0,
                ),
            )
            # Extract the raw response correctly
            if not response.candidates:
                raise ValueError(
                    f"No candidates returned in response for {model_class.__name__}"
                )
            raw_response = response.candidates[0].content.parts[0].text
            logging.debug(f"Raw response for {model_class.__name__}: {raw_response}")
            if is_venue_other:
                print(f"Raw response for WeddingVenueOther: {raw_response}")

            # Attempt to parse JSON
            try:
                venue_info_dict = json.loads(raw_response)
                # Convert string numbers to integers for guest_capacity
                if (
                    is_venue_other
                    and "guest_capacity" in venue_info_dict
                    and isinstance(venue_info_dict["guest_capacity"], str)
                ):
                    try:
                        venue_info_dict["guest_capacity"] = int(
                            venue_info_dict["guest_capacity"]
                        )
                    except ValueError:
                        # Fallback if conversion fails
                        venue_info_dict["guest_capacity"] = -1
            except json.JSONDecodeError as json_error:
                logging.error(
                    f"JSON Decode Error for {model_class.__name__}: {json_error}. Raw response: {raw_response}"
                )
                print(
                    f"✗ JSON Decode Error for {model_class.__name__} for {venue_name}: {json_error}. Raw response: {raw_response}"
                )
                venue_info_dict = {}
                extraction_ok = False

            # Manually validate with Pydantic for WeddingVenueOther
            if is_venue_other:
                try:
                    venue_info_dict = model_class(**venue_info_dict).model_dump()
                except Exception as pydantic_error:
                    logging.error(
                        f"Pydantic Validation Error for {model_class.__name__}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    print(
                        f"✗ Pydantic Validation Error for {model_class.__name__} for {venue_name}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    venue_info_dict = {}
                    extraction_ok = False

            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info_dict.items()
            }
            venue_dict.update(prefixed_venue_info)
            if extraction_ok:
                logging.info(
                    f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
                )
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            logging.error(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )
            print(f"✗ Error processing {model_class.__name__} for {venue_name}: {e}")
            # Skip WeddingVenueOther with placeholder if error persists
            if is_venue_other:
                logging.warning(
                    f"Skipping WeddingVenueOther for {venue_name} due to error: {e}"
                )
                venue_dict.update({f"{model_class.__name__}_skipped": True})

    venue_data.append(venue_dict)
    logging.debug(f"Venue data appended: {venue_dict}")

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    # Timestamped file name so repeated runs never overwrite each other.
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    # Record which model produced the rows.
    df["model"] = ai_model
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

In [74]:
# Stack the per-model result frames into one table with a fresh 0..n-1 index.
df_all = pd.concat([df_4o, df_4omini, df_preview, df_gemini], ignore_index=True)
# "WeddingVenueOther_skipped" is only created when an extraction failed, so it
# may be absent; errors="ignore" avoids a KeyError in that case.
df_all = df_all.drop(columns=["WeddingVenueOther_skipped"], errors="ignore")

In [75]:
# index=False keeps the positional index out of the file (consistent with the
# other exports in this notebook); otherwise it comes back as an extra
# "Unnamed: 0" column when the CSV is read again.
df_all.to_csv("final_demo_venues_data.csv", index=False)

In [28]:
# Display the current DataFrame.
df

Unnamed: 0,name,FoodBreakdown_summary,BarBreakdown_summary
0,a.o.c. Brentwood,Standard\n- Appetizers: bacon-wrapped dates st...,Standard: The Standard package includes a sele...
1,Aliso Viejo Country Club,Standard\n- Appetizers: One butler passed hors...,Standard: The 'Adore' package includes non-alc...
2,Agua Hedionda Nature Center,"Standard\n- Price: $3,250\n\nSignature\n- Pric...",Offers Hosted Bar: No\nBar Pricing Model: Not ...
3,Aliso Viejo Wedgewood,Standard\n- Entrées: 2-Entrée Buffet\n- Price:...,Standard: Cash Bar with Soft Drinks Included: ...
4,Alcazar Palm Springs,Standard\n\nSignature\n\nPremium\n- Price: $18...,
5,Almansor Court,Standard\n- Appetizers: Choice of Two Hors D'O...,Standard: The Silver Package includes a domest...
6,Ace Hotel Palm Springs,"Standard\n- Appetizers: Beet Falafel, Vegetabl...","Standard: Beer & Wine Bar includes two beers, ..."
7,94th Aero Squadron Restaurant_,"Standard\n- Appetizers: French Onion, Seafood ...",Offers Hosted Bar: No\nBar Pricing Model: Not ...


In [30]:
# Export the frame to an Excel workbook, leaving out the index column.
df.to_excel("final_demo_venues_data_p.xlsx", index=False)