In [None]:
!pip install -qq google-generativeai

In [None]:
import logging
import os
from datetime import datetime
from pathlib import Path

import openai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from rich import print
from tqdm import tqdm

logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
df = pd.read_excel("Wedding Values.xlsx")
df.columns

Index(['City', 'Zip Code ', 'State', 'Country ', 'Email', 'Phone Number',
       'Price', 'Price Breakdown', 'Menu Breakdown', 'Bar Breakdown',
       'Groom and Bridal Set-Up', 'Ceremony Cost ', 'Guest Capacity ',
       'Outside Food', 'Outside Alcohol', 'Outside Dessert ',
       'Outside Wedding Coordinator', 'Outside Photographer ',
       'Package Approach', 'Pricing Transparency ', 'Reception or Ceremony',
       'Style', 'Indoor/Outdoor', 'Deposit and Payment Plans ', 'Privacy',
       'Accommodations ', 'Photography Score ', 'Environmental ',
       'What Time Does the Party Need to Stop', 'Late Night Eats ',
       'General Vibe', 'Top Choices ', 'Menu Choices '],
      dtype='object')

In [2]:
from wedding_venue_models import *

In [27]:
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Generate field-specific instructions from a Pydantic model's docstrings."""
    instructions = []
    for field_name, field_info in model_class.__fields__.items():
        docstring = field_info.description or "No description provided."
        instructions.append(f"- {field_name}: {docstring}")
    return "\n".join(instructions)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Generate a system prompt based on the model class and its field descriptions."""
    field_instructions = generate_field_instructions(model_class)
    return f"""You are an expert in wedding planning. You are extracting structured information about wedding venues.

First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.

Important instructions:
1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.
2. For boolean fields, return true/false values rather than "Yes"/"No" strings.
3. For string fields, provide detailed information or null if not available.
4. For numerical fields, use -1 if information is not available.
5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.

Field-specific instructions:
{field_instructions}
"""


# Setup logging and OpenAI client
logging.basicConfig(level=logging.WARNING)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# List of all Pydantic models to process
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    print(f"Warning: {test_md_path} directory not found")
    exit()

md_files = list(test_md_path.glob("*.md"))
ai_model = "gpt-4.5-preview"

for file in tqdm(md_files[0:5], desc="Processing venues", unit="file"):
    tqdm.write(f"Processing: {file.name}")
    with open(file, "r", encoding="utf-8") as f:
        md_content = f.read()

    venue_name = file.stem
    venue_dict = {"name": venue_name}

    # Process each model for this venue
    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        try:
            completion = client.beta.chat.completions.parse(
                model=ai_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": f"Extract venue information from this text about '{venue_name}':\n\n{md_content}",
                    },
                ],
                response_format=model_class,
                temperature=0,
            )
            venue_info = completion.choices[0].message.parsed.model_dump()
            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info.items()
            }
            venue_dict.update(prefixed_venue_info)
            tqdm.write(
                f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
            )
        except Exception as e:
            tqdm.write(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )

    venue_data.append(venue_dict)

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    df["model"] = ai_model
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

Processing venues:   0%|          | 0/5 [00:00<?, ?file/s]/var/folders/ny/x98qxh1124dbtz5g3f6s09hh0000gn/T/ipykernel_25986/2078240712.py:4: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  for field_name, field_info in model_class.__fields__.items():


Processing: a.o.c. Brentwood.md


Processing venues:   0%|          | 0/5 [00:04<?, ?file/s]

✓ Successfully processed WeddingContactInfo for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:14<?, ?file/s]

✓ Successfully processed WeddingPriceInfo for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:20<?, ?file/s]

✓ Successfully processed WeddingVenueStyle for: a.o.c. Brentwood


Processing venues:   0%|          | 0/5 [00:27<?, ?file/s]

✓ Successfully processed WeddingVenueOther for: a.o.c. Brentwood


Processing venues:  20%|██        | 1/5 [00:54<03:38, 54.51s/file]

✓ Successfully processed WeddingFoodInfo for: a.o.c. Brentwood
Processing: Aliso Viejo Country Club.md


Processing venues:  20%|██        | 1/5 [00:59<03:38, 54.51s/file]

✓ Successfully processed WeddingContactInfo for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:19<03:38, 54.51s/file]

✓ Successfully processed WeddingPriceInfo for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:25<03:38, 54.51s/file]

✓ Successfully processed WeddingVenueStyle for: Aliso Viejo Country Club


Processing venues:  20%|██        | 1/5 [01:34<03:38, 54.51s/file]

✓ Successfully processed WeddingVenueOther for: Aliso Viejo Country Club


Processing venues:  40%|████      | 2/5 [02:10<03:21, 67.32s/file]

✓ Successfully processed WeddingFoodInfo for: Aliso Viejo Country Club
Processing: Agua Hedionda Nature Center.md


Processing venues:  40%|████      | 2/5 [02:13<03:21, 67.32s/file]

✓ Successfully processed WeddingContactInfo for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:30<03:21, 67.32s/file]

✓ Successfully processed WeddingPriceInfo for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:35<03:21, 67.32s/file]

✓ Successfully processed WeddingVenueStyle for: Agua Hedionda Nature Center


Processing venues:  40%|████      | 2/5 [02:40<03:21, 67.32s/file]

✓ Successfully processed WeddingVenueOther for: Agua Hedionda Nature Center


Processing venues:  60%|██████    | 3/5 [02:43<01:43, 51.67s/file]

✓ Successfully processed WeddingFoodInfo for: Agua Hedionda Nature Center
Processing: Aliso Viejo Wedgewood.md


Processing venues:  60%|██████    | 3/5 [02:47<01:43, 51.67s/file]

✓ Successfully processed WeddingContactInfo for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:06<01:43, 51.67s/file]

✓ Successfully processed WeddingPriceInfo for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:11<01:43, 51.67s/file]

✓ Successfully processed WeddingVenueStyle for: Aliso Viejo Wedgewood


Processing venues:  60%|██████    | 3/5 [03:16<01:43, 51.67s/file]

✓ Successfully processed WeddingVenueOther for: Aliso Viejo Wedgewood


Processing venues:  80%|████████  | 4/5 [03:27<00:48, 48.43s/file]

✓ Successfully processed WeddingFoodInfo for: Aliso Viejo Wedgewood
Processing: Alcazar Palm Springs.md


Processing venues:  80%|████████  | 4/5 [03:29<00:48, 48.43s/file]

✓ Successfully processed WeddingContactInfo for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:46<00:48, 48.43s/file]

✓ Successfully processed WeddingPriceInfo for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:50<00:48, 48.43s/file]

✓ Successfully processed WeddingVenueStyle for: Alcazar Palm Springs


Processing venues:  80%|████████  | 4/5 [03:55<00:48, 48.43s/file]

✓ Successfully processed WeddingVenueOther for: Alcazar Palm Springs


Processing venues: 100%|██████████| 5/5 [03:59<00:00, 47.93s/file]

✓ Successfully processed WeddingFoodInfo for: Alcazar Palm Springs





In [None]:
df_preview = df

In [20]:
df_4omini = pd.concat([df_old, df])
df_4omini

Unnamed: 0,name,WeddingContactInfo_city,WeddingContactInfo_state,WeddingContactInfo_zip_code,WeddingContactInfo_country,WeddingContactInfo_email,WeddingContactInfo_phone,WeddingContactInfo_website,WeddingPriceInfo_price,WeddingPriceInfo_price_breakdown,...,WeddingFoodInfo_outside_dessert_allowed,WeddingFoodInfo_kosher_food,WeddingFoodInfo_halal_food,WeddingFoodInfo_east_asian_food,WeddingFoodInfo_indian_food,WeddingFoodInfo_gluten_free_food,WeddingFoodInfo_other_ethnic_food_style,WeddingFoodInfo_late_night_food,WeddingFoodInfo_name,model
0,a.o.c. Brentwood,Los Angeles,CA,90049.0,USA,,,,250,"{'base_prices': '$20,000 Sunday for 80 guests,...",...,True,False,False,False,False,True,,False,a.o.c. Brentwood,gpt-4o-mini
0,Aliso Viejo Country Club,Aliso Viejo,California,-1.0,United States,,949.284.5460,alisogolf.com,115,{'base_prices': '$98 per person for Adore pack...,...,True,False,False,False,False,True,,False,Aliso Viejo Country Club,gpt-4o-mini
1,Agua Hedionda Nature Center,,,,,,,,-1,"{'base_prices': '$3,250 for Coastal Package, $...",...,True,False,False,False,False,False,,False,Agua Hedionda Nature Center,gpt-4o-mini
2,Aliso Viejo Wedgewood,Aliso Viejo,CA,92656.0,USA,Events@WedgewoodWeddings.com,866.966.3009,WedgewoodWeddings.com,128,{'base_prices': '$98 per person for Classic pa...,...,True,False,False,False,False,True,,False,Aliso Viejo Wedgewood,gpt-4o-mini
3,Alcazar Palm Springs,Palm Springs,California,-1.0,United States,Brittany@F10creative.com,-1,-1,245,"{'base_prices': '$8,000 non-refundable deposit...",...,True,False,False,False,False,False,,False,Alcazar Palm Springs,gpt-4o-mini


In [26]:
df_4o = pd.concat([df, df_old])
df_4o

Unnamed: 0,name,WeddingContactInfo_city,WeddingContactInfo_state,WeddingContactInfo_zip_code,WeddingContactInfo_country,WeddingContactInfo_email,WeddingContactInfo_phone,WeddingContactInfo_website,WeddingPriceInfo_price,WeddingPriceInfo_price_breakdown,...,WeddingFoodInfo_outside_dessert_allowed,WeddingFoodInfo_kosher_food,WeddingFoodInfo_halal_food,WeddingFoodInfo_east_asian_food,WeddingFoodInfo_indian_food,WeddingFoodInfo_gluten_free_food,WeddingFoodInfo_other_ethnic_food_style,WeddingFoodInfo_late_night_food,WeddingFoodInfo_name,model
0,a.o.c. Brentwood,Los Angeles,CA,90049.0,USA,,,,325,"{'base_prices': '$20,000 Sunday for 80 guests,...",...,True,False,False,False,False,True,,False,a.o.c. Brentwood,gpt-4o
0,Aliso Viejo Country Club,Aliso Viejo,California,,USA,,949.284.5460,alisogolf.com,165,{'base_prices': '$98 per person for Adore pack...,...,True,False,False,True,True,True,"Mediterranean, Cuban",False,Aliso Viejo Country Club,gpt-4o
1,Agua Hedionda Nature Center,,,,,,,,150,"{'base_prices': '$3,250 for Coastal Package, $...",...,True,False,False,False,False,False,,False,Agua Hedionda Nature Center,gpt-4o
2,Aliso Viejo Wedgewood,Aliso Viejo,CA,92656.0,USA,Events@WedgewoodWeddings.com,866.966.3009,WedgewoodWeddings.com,128,"{'base_prices': '$2,995 for Friday, Sunday, Sa...",...,True,False,False,False,False,False,,False,Aliso Viejo Wedgewood,gpt-4o
3,Alcazar Palm Springs,Palm Springs,California,,USA,Brittany@F10creative.com,,,450,"{'base_prices': '$8,000 venue fee, $18,000 foo...",...,True,False,False,False,False,False,,False,Alcazar Palm Springs,gpt-4o


In [38]:
df_4omini.to_csv("df_4omini.csv")

In [None]:
import json
import logging
import os
from datetime import datetime
from pathlib import Path

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Setup detailed logging
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables
load_dotenv()
# # api_key = os.getenv("GOOGLE_API_KEY")
# if not api_key:
#     raise ValueError(
#         "GOOGLE_API_KEY not found in environment variables. Please set it in your .env file."
#     )

# Configure the Gemini API
api_key = "AIzaSyAnFuVtRCMOC7NMJHgAbfrW9wzDahMG6gY"
genai.configure(api_key=api_key)


# Define helper functions
def generate_field_instructions(model_class: type[BaseModel]) -> str:
    """Generate field-specific instructions from a Pydantic model's docstrings."""
    instructions = []
    for field_name, field_info in model_class.model_fields.items():
        docstring = field_info.description or "No description provided."
        # Add type hint to prompt for numeric fields
        if field_name == "guest_capacity":
            instructions.append(
                f"- {field_name}: {docstring} (Return as an integer: 1, 50, 100, 150, 200, or 300, or -1 if not available)"
            )
        else:
            instructions.append(f"- {field_name}: {docstring}")
    return "\n".join(instructions)


def create_system_prompt(model_class: type[BaseModel]) -> str:
    """Generate a comprehensive prompt based on the model class and its field descriptions."""
    field_instructions = generate_field_instructions(model_class)
    return f"""You are an expert in wedding planning. You are extracting structured information about wedding venues.

First, carefully analyze all relevant information in the text. Consider both explicit statements and reasonable inferences.

Important instructions:
1. For each field, follow the specific guidelines below about how to handle ambiguous or missing information.
2. For boolean fields, return true/false values rather than "Yes"/"No" strings.
3. For string fields, provide detailed information or null if not available.
4. For numerical fields (e.g., guest_capacity), return integers or -1 if not available.
5. Begin by developing a comprehensive reasoning that considers all evidence before determining individual field values.
6. For fields with predefined choices (e.g., Literal types), you MUST return only the exact values listed in the options. If the information does not match any option, default to 'Not enough information' or the specified default value.
7. Return a valid JSON object with no additional text or commentary.

Field-specific instructions:
{field_instructions}

Now, extract the following venue information from the provided text:"""


# List of all Pydantic models to process
models = [
    WeddingContactInfo,
    WeddingPriceInfo,
    WeddingVenueStyle,
    WeddingVenueOther,
    WeddingFoodInfo,
]

# Process venues
venue_data = []
test_md_path = Path("test_md")
if not test_md_path.exists():
    print(f"Warning: {test_md_path} directory not found")
    exit()

md_files = list(test_md_path.glob("*.md"))
ai_model = "gemini-2.0-flash-exp"  # Adjusted to a valid Gemini model

for file in tqdm(md_files[2:5], desc="Processing venues", unit="file"):
    logging.info(f"Processing file: {file.name}")
    with open(file, "r", encoding="utf-8") as f:
        md_content = f.read()

    venue_name = file.stem
    venue_dict = {"name": venue_name}
    logging.debug(f"Venue dict initialized for {venue_name}: {venue_dict}")

    # Process each model for this venue
    for model_class in models:
        system_prompt = create_system_prompt(model_class)
        logging.debug(
            f"System prompt for {model_class.__name__}: {system_prompt[:500]}..."
        )  # Limit to first 500 chars
        try:
            # Use genai.GenerativeModel directly
            model = genai.GenerativeModel(model_name=ai_model)
            logging.debug(f"Generating content with model: {ai_model}")
            # Skip response_schema for WeddingVenueOther to avoid schema enforcement issues
            use_schema = model_class != WeddingVenueOther
            response = model.generate_content(
                contents=[
                    {
                        "role": "user",
                        "parts": [
                            f"{system_prompt}\n\nExtract venue information from this text about '{venue_name}':\n\n{md_content}"
                        ],
                    },
                ],
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=model_class if use_schema else None,
                    temperature=0,
                ),
            )
            # Extract the raw response correctly
            if not response.candidates:
                raise ValueError(
                    f"No candidates returned in response for {model_class.__name__}"
                )
            raw_response = response.candidates[0].content.parts[0].text
            logging.debug(f"Raw response for {model_class.__name__}: {raw_response}")
            if model_class == WeddingVenueOther:
                print(f"Raw response for WeddingVenueOther: {raw_response}")
            # Attempt to parse JSON
            try:
                venue_info_dict = json.loads(raw_response)
                # Convert string numbers to integers for guest_capacity
                if (
                    model_class == WeddingVenueOther
                    and "guest_capacity" in venue_info_dict
                ):
                    if isinstance(venue_info_dict["guest_capacity"], str):
                        try:
                            venue_info_dict["guest_capacity"] = int(
                                venue_info_dict["guest_capacity"]
                            )
                        except ValueError:
                            venue_info_dict[
                                "guest_capacity"
                            ] = -1  # Fallback if conversion fails
            except json.JSONDecodeError as json_error:
                logging.error(
                    f"JSON Decode Error for {model_class.__name__}: {json_error}. Raw response: {raw_response}"
                )
                print(
                    f"✗ JSON Decode Error for {model_class.__name__} for {venue_name}: {json_error}. Raw response: {raw_response}"
                )
                venue_info_dict = {}
            # Manually validate with Pydantic for WeddingVenueOther
            if model_class == WeddingVenueOther:
                try:
                    venue_info_dict = model_class(**venue_info_dict).model_dump()
                except Exception as pydantic_error:
                    logging.error(
                        f"Pydantic Validation Error for {model_class.__name__}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    print(
                        f"✗ Pydantic Validation Error for {model_class.__name__} for {venue_name}: {pydantic_error}. Parsed dict: {venue_info_dict}"
                    )
                    venue_info_dict = {}
            # Prefix keys to avoid collisions between models
            prefixed_venue_info = {
                f"{model_class.__name__}_{k}": v for k, v in venue_info_dict.items()
            }
            venue_dict.update(prefixed_venue_info)
            logging.info(
                f"✓ Successfully processed {model_class.__name__} for: {venue_name}"
            )
        except Exception as e:
            logging.error(
                f"✗ Error processing {model_class.__name__} for {venue_name}: {e}"
            )
            print(f"✗ Error processing {model_class.__name__} for {venue_name}: {e}")
            # Skip WeddingVenueOther with placeholder if error persists
            if model_class == WeddingVenueOther:
                logging.warning(
                    f"Skipping WeddingVenueOther for {venue_name} due to error: {e}"
                )
                venue_dict.update({f"{model_class.__name__}_skipped": True})

    venue_data.append(venue_dict)
    logging.debug(f"Venue data appended: {venue_dict}")

# Create and save DataFrame
if venue_data:
    df = pd.DataFrame(venue_data)
    print(f"\nProcessed {len(venue_data)} venues")
    print(df)

    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"venues_data_{now}.csv"
    df["model"] = ai_model
    df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
else:
    print("No venue data was processed")

In [74]:
df_all = pd.concat([df_4o, df_4omini, df_preview, df_gemini])
df_all = (
    df_all.drop(columns=["WeddingVenueOther_skipped"])
    .reset_index()
    .drop(columns=["index"])
)

In [75]:
df_all.to_csv("final_demo_venues_data.csv")