In [23]:
from pathlib import Path

path = Path(
    "/Users/mac-robertsocolewicz/Documents/private/playground_tables/test_md/Almansor Court.md"
)
venue_name = path.name.replace(path.suffix, "")

with open(path) as f:
    doc = f.read()

system_prompt = f"""
You are a helpful wedding AI assistant. Guide the user through understanding various options and pricing for the following wedding venue:

===
venue name: {venue_name}

venue description:

{doc}

===
"""


[1]

In [43]:
import logging
import os
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm
from typing import Literal

logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)


class PriceInfo(BaseModel):
    option: Literal["deluxe", "premium", "standard", "economy", "other"]
    """
    This is the pricing option for this wedding venue. Please map
    silver/gold/platinum and basic/craft to these options. Use "other" only if
    necessary.
    """
    price_estimate: int
    """
    This is the price estimate per person for a group of 100 guests for this
    wedding venue.
    """
    highlights: str
    """
    A summary of the package option highlights. 
    """
    flexibility: Literal[
        "Completely fixed packages, no flexibility",
        "Fixed packages with a few extras or options",
        "Moderate or flexible approach",
        "Highly customizable with some structure",
        "Completely custom/DIY",
    ]
    """
    How much freedom does the customer have to customize the package?
    """


class WeddingPriceInfo(PriceInfo):
    pass


class BarPriceInfo(PriceInfo):
    hosted_bar: Literal["open bar", "hosted bar", "cash bar"]
    """
    The type of bar at this wedding venue:

    - open bar: the venue provides the bar and the drinks, prepaid for by the
      host
    - hosted bar: the venue provides the bar and the drinks, paid for by the
      host at the end of the night, also referred to as a consumption bar or tab
      bar
    - cash bar: the venue provides the bar, but the drinks are payed for by the
      guests
    """


class MenuPriceInfo(PriceInfo):
    highlights: str
    """
    A summary of the menu option highlights. Provide a concise summary of the
    menu options, including appetizers, main courses, etc. Please structure it
    in a nice way.
    """


class WeddingVenuePricingSummary(BaseModel):
    options: list[WeddingPriceInfo]
    """
    This is a list of all the pricing options for this wedding venue.
    """
    pricing_transparency: (
        Literal[
            "This venue discloses a small portion of the total wedding costs",
            "This venue discloses a moderate portion of the total wedding costs",
            "This venue discloses a high degree of the total costs",
            "Not enough information",
        ]
        | None
    )
    """
    Assess how much of the total wedding cost is disclosed in the provided
    materials. You MUST CHOOSE ONE of the following options that best matches
    the document's content and return the selected option's description as the
    field value:

    - This venue discloses a small portion of the total wedding costs
    - This venue discloses a moderate portion of the total wedding costs
    - This venue discloses a high degree of the total wedding costs
    - Not enough information
    
    Guidance when selecting the option: A 'small portion' of disclosure means
    significant costs (e.g., food, bar/alcohol) are unclear or require
    contacting external vendors. A 'moderate portion' means some unknowns exist,
    but you can get a general cost idea without much extra work. A 'high degree'
    means most costs are disclosed with few surprises, little additional work
    needed to understand the total cost."""

    deposit_and_payment_plans: (
        Literal[
            "The venue works with me on deposit terms and payment plans",
            "The venue does not have flexibility on deposit terms and payment plans",
            "Not enough information",
        ]
        | None
    )
    """
    Determine if the venue offers flexibility on deposit terms and payment
    plans. You MUST CHOOSE ONE of the following options that best matches the
    document's content and return the selected option's description as the field
    value:
    
    - The venue works with me on deposit terms and payment plans 
    - The venue does not have flexibility on deposit terms and payment plans 
    - Not enough information. 
    
    Follow these guidance when selecting the option: Flexibility means the venue
    allows negotiation on deposit amounts, payment schedules, or offers
    installment plans. Lack of flexibility is indicated by strict terms or no
    mention of flexible options."""


class BarPricingSummary(BaseModel):
    options: list[BarPriceInfo]
    """
    This is a list of bar-related pricing options for this wedding. Please only
    include pricing options that correspond to packages and menus, not
    individual prices. If there are more than 3 options, only include three that
    are diverse. Any fixed fees should be included as part of the price
    estimate.
    """


class MenuPricingSummary(BaseModel):
    options: list[MenuPriceInfo]
    """
    This is a list of all the menu and restaurant related pricing options for
    this wedding venue. Please only include pricing options that correspond to
    packages and menus, not individual prices.
    """


prompts_and_response_formats = [
    (
        """
        Please provide all the pricing info for this wedding venue.
        """,
        WeddingVenuePricingSummary,
    ),
    (
        """
        Please provide all the pricing info related to the bar for this wedding
        venue.
        """,
        BarPricingSummary,
    ),
    (
        """
        Please provide all the pricing info related to the menu/restaurant for
        this wedding venue.
        """,
        MenuPricingSummary,
    ),
]
completions = []
for prompt, response_format in tqdm(prompts_and_response_formats, desc="Processing"):
    tqdm.write(f"Processing: {prompt.strip()}", end="\r")

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": prompt},
        ],
        response_format=response_format,
    )
    completions.append(completion.choices[0].message.parsed)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Processing: Please provide all the pricing info for this wedding venue.

Processing:  33%|███▎      | 1/3 [00:04<00:08,  4.24s/it]

Processing: Please provide all the pricing info related to the bar for this wedding
        venue.

Processing:  67%|██████▋   | 2/3 [00:11<00:05,  5.89s/it]

Processing: Please provide all the pricing info related to the menu/restaurant for
        this wedding venue.

Processing: 100%|██████████| 3/3 [00:14<00:00,  4.73s/it]


In [44]:
for model in completions:
    print(model.__class__.__name__)
    print(model.model_dump_json(indent=2).replace("{", "").replace("}", ""))


In [48]:
{
    "wedding_venue": "Almansor Court",
    "wedding_price_info": completions[0].model_dump(),
    "menu_price_info": completions[2].model_dump(),
    "bar_price_info": completions[1].model_dump(),
}

{'wedding_venue': 'Almansor Court',
 'wedding_price_info': {'options': [{'option': 'standard',
    'price_estimate': 64,
    'highlights': 'Three Course Meal, Champagne & Apple Cider Toast, Custom Designed Cake, Unlimited Lemonade Service, Complimentary Parking.',
    'flexibility': 'Fixed packages with a few extras or options'},
   {'option': 'premium',
    'price_estimate': 84,
    'highlights': "Choice of Three Hors D'Oeuvres, Three Course Meal, 1 Hour Hosted Well Full Bar, Unlimited Soft Drinks & Juices, Complimentary Parking.",
    'flexibility': 'Moderate or flexible approach'},
   {'option': 'deluxe',
    'price_estimate': 93,
    'highlights': 'Almansor Buffet, Includes multiple entrées and sides for 1.5 Hours Buffet Service.',
    'flexibility': 'Moderate or flexible approach'},
   {'option': 'economy',
    'price_estimate': 50,
    'highlights': 'Basic package with essential amenities, suited for smaller gatherings or limited budgets.',
    'flexibility': 'Completely fixed pa

In [52]:
import pandas as pd
import json
import openpyxl
from openpyxl.styles import Font, Alignment, PatternFill


def flatten_nested_data(venue_data):
    """Process complex wedding venue data with multiple nested JSON fields"""
    # Make sure all JSON strings are parsed
    venue_dict = {}
    for key, value in venue_data.items():
        if isinstance(value, str) and (value.startswith("{") or value.startswith("[")):
            try:
                venue_dict[key] = json.loads(value)
            except json.JSONDecodeError:
                venue_dict[key] = value
        else:
            venue_dict[key] = value

    # Create a base row with venue information
    base_row = {"venue_name": venue_dict.get("wedding_venue", "")}

    all_rows = []

    # Process wedding price info if available
    if "wedding_price_info" in venue_dict:
        wedding_info = venue_dict["wedding_price_info"]
        if isinstance(wedding_info, dict) and "options" in wedding_info:
            for option in wedding_info["options"]:
                row = base_row.copy()
                row["price_category"] = "Wedding"
                row["package_name"] = option.get("option", "")
                row["price_estimate"] = option.get("price_estimate", "")
                row["highlights"] = option.get("highlights", "")
                row["flexibility"] = option.get("flexibility", "")
                all_rows.append(row)

    # Process menu price info if available
    if "menu_price_info" in venue_dict:
        menu_info = venue_dict["menu_price_info"]
        if isinstance(menu_info, dict) and "options" in menu_info:
            for option in menu_info["options"]:
                row = base_row.copy()
                row["price_category"] = "Menu"
                row["package_name"] = option.get("option", "")
                row["price_estimate"] = option.get("price_estimate", "")
                row["highlights"] = option.get("highlights", "")
                row["flexibility"] = option.get("flexibility", "")
                all_rows.append(row)

    # Process bar price info if available
    if "bar_price_info" in venue_dict:
        bar_info = venue_dict["bar_price_info"]
        if isinstance(bar_info, dict) and "options" in bar_info:
            for option in bar_info["options"]:
                row = base_row.copy()
                row["price_category"] = "Bar"
                row["package_name"] = option.get("option", "")
                row["price_estimate"] = option.get("price_estimate", "")
                row["highlights"] = option.get("highlights", "")
                row["flexibility"] = option.get("flexibility", "")
                all_rows.append(row)

    return all_rows


def export_to_excel(data_list, filename="wedding_venues_complex.xlsx"):
    """Export processed data to Excel with formatting"""
    # Create dataframe from list of dictionaries
    df = pd.DataFrame(data_list)

    # Export to Excel
    with pd.ExcelWriter(filename, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="Venue Options", index=False)

        # Apply formatting
        workbook = writer.book
        worksheet = writer.sheets["Venue Options"]

        # Format header
        header_fill = PatternFill(
            start_color="B3E5FC", end_color="B3E5FC", fill_type="solid"
        )
        header_font = Font(bold=True)

        for col in range(1, len(df.columns) + 1):
            cell = worksheet.cell(row=1, column=col)
            cell.fill = header_fill
            cell.font = header_font

        # Auto-size columns
        for col in worksheet.columns:
            max_length = 0
            column = col[0].column_letter
            for cell in col:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            adjusted_width = max_length + 2
            worksheet.column_dimensions[column].width = min(adjusted_width, 50)

        # Add autofilter
        worksheet.auto_filter.ref = worksheet.dimensions

        # Create another sheet with a pivot table view
        pivot_df = df.pivot_table(
            index=["venue_name", "package_name"],
            columns=["price_category"],
            values=["price_estimate"],
            aggfunc="first",
            fill_value="",
        ).reset_index()

        pivot_df.columns = [
            f"{col[0]}_{col[1]}" if col[1] else col[0] for col in pivot_df.columns
        ]
        pivot_df.rename(
            columns={
                "price_estimate_Bar": "bar_price",
                "price_estimate_Menu": "menu_price",
                "price_estimate_Wedding": "wedding_price",
            },
            inplace=True,
        )

        pivot_df.to_excel(writer, sheet_name="Package Summary", index=False)

        # Format pivot table sheet
        pivot_sheet = writer.sheets["Package Summary"]

        for col in range(1, len(pivot_df.columns) + 1):
            cell = pivot_sheet.cell(row=1, column=col)
            cell.fill = header_fill
            cell.font = header_font

        for col in pivot_sheet.columns:
            max_length = 0
            column = col[0].column_letter
            for cell in col:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            adjusted_width = max_length + 2
            pivot_sheet.column_dimensions[column].width = min(adjusted_width, 50)

        pivot_sheet.auto_filter.ref = pivot_sheet.dimensions

    print(f"Data exported to {filename}")
    return filename


def process_multiple_venues(venues_list):
    """Process multiple wedding venues with complex data structure"""
    all_rows = []
    for venue_data in venues_list:
        rows = flatten_nested_data(venue_data)
        all_rows.extend(rows)
    return all_rows


# Example usage
sample_data = [
    {
        "wedding_venue": "Almansor Court",
        "wedding_price_info": completions[0].model_dump(),
        "menu_price_info": completions[2].model_dump(),
        "bar_price_info": completions[1].model_dump(),
    }
]

# Process single venue with complex data
processed_data = process_multiple_venues(sample_data)
export_to_excel(processed_data, "all.xlsx")


'all.xlsx'

In [42]:
import pandas as pd
import json
import openpyxl
from openpyxl.styles import Font, Alignment, PatternFill

# Sample data
wedding_data = {
    "wedding_venue": "Almansor Court",
    "menu_price_info": json.dumps(
        {
            "options": [
                {
                    "option": "standard",
                    "price_estimate": 64,
                    "highlights": "Three Course Meal, Champagne & Apple Cider Toast, Custom Designed Cake, Unlimited Lemonade Service, Complimentary Parking",
                    "flexibility": "Fixed packages with a few extras or options",
                },
                {
                    "option": "premium",
                    "price_estimate": 74,
                    "highlights": "Three Course Meal, Glass of House Wine, Domestic Keg of Beer, Unlimited Lemonade Service, Complimentary Parking",
                    "flexibility": "Fixed packages with a few extras or options",
                },
                {
                    "option": "deluxe",
                    "price_estimate": 84,
                    "highlights": "Three Course Meal, Glass of House Wine, 1-Hour Hosted Well Full Bar, Unlimited Soft Drinks, Gold or Silver Chiavari Chairs",
                    "flexibility": "Moderate or flexible approach",
                },
                {
                    "option": "economy",
                    "price_estimate": 59,
                    "highlights": "Buffet Service, Garden Green Salad, Oven Roasted Potatoes or Rice Pilaf, Various Entrées",
                    "flexibility": "Fixed packages with a few extras or options",
                },
            ]
        }
    ),
}


def process_venue_data(venue_data):
    """Process wedding venue data and convert to dataframe rows"""
    # Parse the menu price info JSON
    if isinstance(venue_data["menu_price_info"], str):
        menu_price_info = json.loads(venue_data["menu_price_info"])
    else:
        menu_price_info = venue_data["menu_price_info"]

    # Create rows for each option
    rows = []
    for option in menu_price_info["options"]:
        row = {
            "venue_name": venue_data["wedding_venue"],
            "package_name": option["option"],
            "price_per_person": option["price_estimate"],
            "highlights": option["highlights"],
            "flexibility": option["flexibility"],
        }
        rows.append(row)

    return rows


def export_to_excel(data_list, filename="wedding_venues.xlsx"):
    """Export processed data to Excel with formatting"""
    df = pd.DataFrame(data_list)

    with pd.ExcelWriter(filename, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="Venue Options", index=False)

        workbook = writer.book
        worksheet = writer.sheets["Venue Options"]

        header_fill = PatternFill(
            start_color="B3E5FC", end_color="B3E5FC", fill_type="solid"
        )
        header_font = Font(bold=True)

        for col in range(1, len(df.columns) + 1):
            cell = worksheet.cell(row=1, column=col)
            cell.fill = header_fill
            cell.font = header_font

        for col in worksheet.columns:
            max_length = 0
            column = col[0].column_letter
            for cell in col:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            adjusted_width = max_length + 2
            worksheet.column_dimensions[column].width = min(adjusted_width, 50)

        worksheet.auto_filter.ref = worksheet.dimensions

    print(f"Data exported to {filename}")
    return filename


processed_data = process_venue_data(wedding_data)

export_to_excel(processed_data)


def process_multiple_venues(venues_list):
    """Process multiple wedding venues"""
    all_rows = []
    for venue_data in venues_list:
        rows = process_venue_data(venue_data)
        all_rows.extend(rows)
    return all_rows


venues = [
    wedding_data,
    {
        "wedding_venue": "Another Venue",
        "menu_price_info": json.dumps(
            {
                "options": [
                    {
                        "option": "basic",
                        "price_estimate": 55,
                        "highlights": "Two Course Meal, Cash Bar",
                        "flexibility": "Fixed packages",
                    }
                ]
            }
        ),
    },
]

all_venue_data = process_multiple_venues(venues)
export_to_excel(all_venue_data, "all_wedding_venues.xlsx")

'all_wedding_venues.xlsx'

In [None]:
print(completions[0].model_dump_json(indent=2).replace("{", "").replace("}", ""))

In [20]:
print(as_markdown(completions))

In [32]:
completion.choices[0].message.parsed.options

[WeddingPriceInfo(option='standard', price=2500, price_breakdown=PriceBreakdown(base_prices='$2,500 for gazebo ceremony up to 150 guests, additional chairs at $2.00 each.', total_cost_for_assumed_guest_count='$2,500 for up to 150 guests on Sunday.', taxes_and_fees='22% hospitality fee plus 2% environmental fee; sales tax additional.', per_person_cost='Not applicable as pricing is based on packages not per person for ceremony.', inclusions='White garden chairs, house sound system, wireless microphone, and stand, tables with linens.', exclusions='Decorations, aisle runners, officiants, and music not included; all decorations must be arranged through preferred vendors.', assumptions='Ceremony for 150 guests.')),
 WeddingPriceInfo(option='premium', price=93, price_breakdown=PriceBreakdown(base_prices='$93 per person for the Platinum Buffet Dinner Package, dinner for 100 guests would be $9,300 before tax and fees.', total_cost_for_assumed_guest_count='$9,300 for 100 guests on dinner service

In [27]:
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Generate field-specific instructions from a Pydantic model's docstrings."""
    instructions = []
    for field_name, field_info in model_class.model_fields.items():
        docstring = field_info.description or "No description provided."
        instructions.append(f"- {field_name}: {docstring}")
    return "\n".join(instructions)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Generate a system prompt based on the model class and its field descriptions."""
    field_instructions = generate_field_instructions(model_class)
    return f"""You are an expert in wedding planning. You are extracting structured information about wedding venues.

First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.

Important instructions:
1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.
2. For boolean fields, return true/false values rather than "Yes"/"No" strings.
3. For string fields, provide detailed information or null if not available.
4. For numerical fields, use -1 if information is not available.
5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.

Field-specific instructions:
{field_instructions}
"""


# Setup logging and OpenAI client
logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# List of all Pydantic models to process
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    print(f"Warning: {test_md_path} directory not found")
    exit()

md_files = list(test_md_path.glob("*.md"))
ai_model = "gpt-4.5-preview"

for file in tqdm(md_files[0:5], desc="Processing venues", unit="file"):
    tqdm.write(f"Processing: {file.name}")
    with open(file, "r", encoding="utf-8") as f:
        md_content = f.read()

    venue_name = file.stem
    venue_dict = {"name": venue_name}

    # Process each model for this venue
    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        try:
            completion = client.beta.chat.completions.parse(
                model=ai_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": f"Extract venue information from this text about '{venue_name}':\n\n{md_content}",
                    },
                ],
                response_format=model_class,
                temperature=0,
            )
            venue_info = completion.choices[0].message.parsed.model_dump()
            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info.items()
            }
            venue_dict.update(prefixed_venue_info)
            tqdm.write(
                f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
            )
        except Exception as e:
            tqdm.write(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )

    venue_data.append(venue_dict)

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    df["model"] = ai_model
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

Processing venues:   0%|          | 0/5 [00:00<?, ?file/s]/var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/ipykernel_25986/2078240712.py:4: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  for field_name, field_info in model_class.__fields__.items():


Processing: a.o.c. Brentwood.md


Processing venues:   0%|          | 0/5 [00:04<?, ?file/s]

✓ Successfully processed WeddingContactInfo for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:14<?, ?file/s]

✓ Successfully processed WeddingPriceInfo for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:20<?, ?file/s]

✓ Successfully processed WeddingVenueStyle for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:27<?, ?file/s]

✓ Successfully processed WeddingVenueOther for: a.o.c. Brentwood


Processing venues:  20%|██        | 1/5 [00:54<03:38, 54.51s/file]

✓ Successfully processed WeddingFoodInfo for: a.o.c. Brentwood
Processing: Aliso Viejo Country Club.md


Processing venues:  20%|██        | 1/5 [00:59<03:38, 54.51s/file]

✓ Successfully processed WeddingContactInfo for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:19<03:38, 54.51s/file]

✓ Successfully processed WeddingPriceInfo for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:25<03:38, 54.51s/file]

✓ Successfully processed WeddingVenueStyle for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:34<03:38, 54.51s/file]

✓ Successfully processed WeddingVenueOther for: Aliso Viejo Country Club


Processing venues:  40%|████      | 2/5 [02:10<03:21, 67.32s/file]

✓ Successfully processed WeddingFoodInfo for: Aliso Viejo Country Club
Processing: Agua Hedionda Nature Center.md


Processing venues:  40%|████      | 2/5 [02:13<03:21, 67.32s/file]

✓ Successfully processed WeddingContactInfo for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:30<03:21, 67.32s/file]

✓ Successfully processed WeddingPriceInfo for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:35<03:21, 67.32s/file]

✓ Successfully processed WeddingVenueStyle for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:40<03:21, 67.32s/file]

✓ Successfully processed WeddingVenueOther for: Agua Hedionda Nature Center


Processing venues:  60%|██████    | 3/5 [02:43<01:43, 51.67s/file]

✓ Successfully processed WeddingFoodInfo for: Agua Hedionda Nature Center
Processing: Aliso Viejo Wedgewood.md


Processing venues:  60%|██████    | 3/5 [02:47<01:43, 51.67s/file]

✓ Successfully processed WeddingContactInfo for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:06<01:43, 51.67s/file]

✓ Successfully processed WeddingPriceInfo for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:11<01:43, 51.67s/file]

✓ Successfully processed WeddingVenueStyle for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:16<01:43, 51.67s/file]

✓ Successfully processed WeddingVenueOther for: Aliso Viejo Wedgewood


Processing venues:  80%|████████  | 4/5 [03:27<00:48, 48.43s/file]

✓ Successfully processed WeddingFoodInfo for: Aliso Viejo Wedgewood
Processing: Alcazar Palm Springs.md


Processing venues:  80%|████████  | 4/5 [03:29<00:48, 48.43s/file]

✓ Successfully processed WeddingContactInfo for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:46<00:48, 48.43s/file]

✓ Successfully processed WeddingPriceInfo for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:50<00:48, 48.43s/file]

✓ Successfully processed WeddingVenueStyle for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:55<00:48, 48.43s/file]

✓ Successfully processed WeddingVenueOther for: Alcazar Palm Springs


Processing venues: 100%|██████████| 5/5 [03:59<00:00, 47.93s/file]

✓ Successfully processed WeddingFoodInfo for: Alcazar Palm Springs





In [None]:
df_preview = df

In [20]:
df_4omini = pd.concat([df_old, df])
df_4omini

Unnamed: 0,name,WeddingContactInfo_city,WeddingContactInfo_state,WeddingContactInfo_zip_code,WeddingContactInfo_country,WeddingContactInfo_email,WeddingContactInfo_phone,WeddingContactInfo_website,WeddingPriceInfo_price,WeddingPriceInfo_price_breakdown,...,WeddingFoodInfo_outside_dessert_allowed,WeddingFoodInfo_kosher_food,WeddingFoodInfo_halal_food,WeddingFoodInfo_east_asian_food,WeddingFoodInfo_indian_food,WeddingFoodInfo_gluten_free_food,WeddingFoodInfo_other_ethnic_food_style,WeddingFoodInfo_late_night_food,WeddingFoodInfo_name,model
0,a.o.c. Brentwood,Los Angeles,CA,90049.0,USA,,,,250,"{'base_prices': '$20,000 Sunday for 80 guests,...",...,True,False,False,False,False,True,,False,a.o.c. Brentwood,gpt-4o-mini
0,Aliso Viejo Country Club,Aliso Viejo,California,-1.0,United States,,949.284.5460,alisogolf.com,115,{'base_prices': '$98 per person for Adore pack...,...,True,False,False,False,False,True,,False,Aliso Viejo Country Club,gpt-4o-mini
1,Agua Hedionda Nature Center,,,,,,,,-1,"{'base_prices': '$3,250 for Coastal Package, $...",...,True,False,False,False,False,False,,False,Agua Hedionda Nature Center,gpt-4o-mini
2,Aliso Viejo Wedgewood,Aliso Viejo,CA,92656.0,USA,Events@WedgewoodWeddings.com,866.966.3009,WedgewoodWeddings.com,128,{'base_prices': '$98 per person for Classic pa...,...,True,False,False,False,False,True,,False,Aliso Viejo Wedgewood,gpt-4o-mini
3,Alcazar Palm Springs,Palm Springs,California,-1.0,United States,Brittany@F10creative.com,-1,-1,245,"{'base_prices': '$8,000 non-refundable deposit...",...,True,False,False,False,False,False,,False,Alcazar Palm Springs,gpt-4o-mini


In [26]:
df_4o = pd.concat([df, df_old])
df_4o

Unnamed: 0,name,WeddingContactInfo_city,WeddingContactInfo_state,WeddingContactInfo_zip_code,WeddingContactInfo_country,WeddingContactInfo_email,WeddingContactInfo_phone,WeddingContactInfo_website,WeddingPriceInfo_price,WeddingPriceInfo_price_breakdown,...,WeddingFoodInfo_outside_dessert_allowed,WeddingFoodInfo_kosher_food,WeddingFoodInfo_halal_food,WeddingFoodInfo_east_asian_food,WeddingFoodInfo_indian_food,WeddingFoodInfo_gluten_free_food,WeddingFoodInfo_other_ethnic_food_style,WeddingFoodInfo_late_night_food,WeddingFoodInfo_name,model
0,a.o.c. Brentwood,Los Angeles,CA,90049.0,USA,,,,325,"{'base_prices': '$20,000 Sunday for 80 guests,...",...,True,False,False,False,False,True,,False,a.o.c. Brentwood,gpt-4o
0,Aliso Viejo Country Club,Aliso Viejo,California,,USA,,949.284.5460,alisogolf.com,165,{'base_prices': '$98 per person for Adore pack...,...,True,False,False,True,True,True,"Mediterranean, Cuban",False,Aliso Viejo Country Club,gpt-4o
1,Agua Hedionda Nature Center,,,,,,,,150,"{'base_prices': '$3,250 for Coastal Package, $...",...,True,False,False,False,False,False,,False,Agua Hedionda Nature Center,gpt-4o
2,Aliso Viejo Wedgewood,Aliso Viejo,CA,92656.0,USA,Events@WedgewoodWeddings.com,866.966.3009,WedgewoodWeddings.com,128,"{'base_prices': '$2,995 for Friday, Sunday, Sa...",...,True,False,False,False,False,False,,False,Aliso Viejo Wedgewood,gpt-4o
3,Alcazar Palm Springs,Palm Springs,California,,USA,Brittany@F10creative.com,,,450,"{'base_prices': '$8,000 venue fee, $18,000 foo...",...,True,False,False,False,False,False,,False,Alcazar Palm Springs,gpt-4o


In [38]:
df_4omini.to_csv("df_4omini.csv")

In [None]:
import json
import logging
import os
from datetime import datetime
from pathlib import Path

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Setup detailed logging
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables
load_dotenv()
# # api_key = os.getenv("GOOGLE_API_KEY")
# if not api_key:
#     raise ValueError(
#         "GOOGLE_API_KEY not found in environment variables. Please set it in your .env file."
#     )

# Configure the Gemini API
api_key = "AIzaSyAnFuVtRCMOC7NMJHgAbfrW9wzDahMG6gY"
genai.configure(api_key=api_key)


# Define helper functions
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Generate field-specific instructions from a Pydantic model's docstrings."""
    instructions = []
    for field_name, field_info in model_class.model_fields.items():
        docstring = field_info.description or "No description provided."
        # Add type hint to prompt for numeric fields
        if field_name == "guest_capacity":
            instructions.append(
                f"- {field_name}: {docstring} (Return as an integer: 1, 50, 100, 150, 200, or 300, or -1 if not available)"
            )
        else:
            instructions.append(f"- {field_name}: {docstring}")
    return "\n".join(instructions)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Generate a comprehensive prompt based on the model class and its field descriptions."""
    field_instructions = generate_field_instructions(model_class)
    return f"""You are an expert in wedding planning. You are extracting structured information about wedding venues.

First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.

Important instructions:
1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.
2. For boolean fields, return true/false values rather than "Yes"/"No" strings.
3. For string fields, provide detailed information or null if not available.
4. For numerical fields (e.g., guest_capacity), return integers or -1 if not available.
5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.
6. For fields with predefined choices (e.g., Literal types), you MUST return only the exact values listed in the options. If the information does not match any option, default to 'Not enough information' or the specified default value.
7. Return a valid JSON object with no additional text or commentary.

Field-specific instructions:
{field_instructions}

Now, extract the following venue information from the provided text:"""


# List of all Pydantic models to process
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    print(f"Warning: {test_md_path} directory not found")
    exit()

md_files = list(test_md_path.glob("*.md"))
ai_model = "gemini-2.0-flash-exp"  # Adjusted to a valid Gemini model

for file in tqdm(md_files[2:5], desc="Processing venues", unit="file"):
    logging.info(f"Processing file: {file.name}")
    with open(file, "r", encoding="utf-8") as f:
        md_content = f.read()

    venue_name = file.stem
    venue_dict = {"name": venue_name}
    logging.debug(f"Venue dict initialized for {venue_name}: {venue_dict}")

    # Process each model for this venue
    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        logging.debug(
            f"System prompt for {model_class.__name__}: {system_prompt[:500]}..."
        )  # Limit to first 500 chars
        try:
            # Use genai.GenerativeModel directly
            model = genai.GenerativeModel(model_name=ai_model)
            logging.debug(f"Generating content with model: {ai_model}")
            # Skip response_schema for WeddingVenueOther to avoid schema enforcement issues
            use_schema = model_class != WeddingVenueOther
            response = model.generate_content(
                contents=[
                    {
                        "role": "user",
                        "parts": [
                            f"{system_prompt}\n\nExtract venue information from this text about '{venue_name}':\n\n{md_content}"
                        ],
                    },
                ],
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=model_class if use_schema else None,
                    temperature=0,
                ),
            )
            # Extract the raw response correctly
            if not response.candidates:
                raise ValueError(
                    f"No candidates returned in response for {model_class.__name__}"
                )
            raw_response = response.candidates[0].content.parts[0].text
            logging.debug(f"Raw response for {model_class.__name__}: {raw_response}")
            if model_class == WeddingVenueOther:
                print(f"Raw response for WeddingVenueOther: {raw_response}")
            # Attempt to parse JSON
            try:
                venue_info_dict = json.loads(raw_response)
                # Convert string numbers to integers for guest_capacity
                if (
                    model_class == WeddingVenueOther
                    and "guest_capacity" in venue_info_dict
                ):
                    if isinstance(venue_info_dict["guest_capacity"], str):
                        try:
                            venue_info_dict["guest_capacity"] = int(
                                venue_info_dict["guest_capacity"]
                            )
                        except ValueError:
                            venue_info_dict[
                                "guest_capacity"
                            ] = -1  # Fallback if conversion fails
            except json.JSONDecodeError as json_error:
                logging.error(
                    f"JSON Decode Error for {model_class.__name__}: {json_error}. Raw response: {raw_response}"
                )
                print(
                    f"✗ JSON Decode Error for {model_class.__name__} for {venue_name}: {json_error}. Raw response: {raw_response}"
                )
                venue_info_dict = {}
            # Manually validate with Pydantic for WeddingVenueOther
            if model_class == WeddingVenueOther:
                try:
                    venue_info_dict = model_class(**venue_info_dict).model_dump()
                except Exception as pydantic_error:
                    logging.error(
                        f"Pydantic Validation Error for {model_class.__name__}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    print(
                        f"✗ Pydantic Validation Error for {model_class.__name__} for {venue_name}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    venue_info_dict = {}
            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info_dict.items()
            }
            venue_dict.update(prefixed_venue_info)
            logging.info(
                f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
            )
        except Exception as e:
            logging.error(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )
            print(f"✗ Error processing {model_class.__name__} for {venue_name}: {e}")
            # Skip WeddingVenueOther with placeholder if error persists
            if model_class == WeddingVenueOther:
                logging.warning(
                    f"Skipping WeddingVenueOther for {venue_name} due to error: {e}"
                )
                venue_dict.update({f"{model_class.__name__}_skipped": True})

    venue_data.append(venue_dict)
    logging.debug(f"Venue data appended: {venue_dict}")

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    df["model"] = ai_model
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

In [74]:
df_all = pd.concat([df_4o, df_4omini, df_preview, df_gemini])
df_all = (
    df_all.drop(columns=["WeddingVenueOther_skipped"])
    .reset_index()
    .drop(columns=["index"])
)

In [75]:
df_all.to_csv("final_demo_venues_data.csv")