# SerpAPI Feature Enhancement Test

This notebook tests scraping additional features for your existing restaurants.

## Goal
Extract valuable features from SerpAPI to enhance your chat agent without excessive API costs.

## Strategy
1. Load your existing scraped restaurants
2. Test API calls to get detailed info for a FEW sample restaurants
3. Validate which features are consistently available
4. Calculate costs and decide which features to scrape for all restaurants

In [9]:
import os
import requests
import json
import pandas as pd
from datetime import datetime, time as dt_time
from typing import Dict, Optional
import time

# Setup
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")
if not SERPAPI_API_KEY:
    print("‚ö†Ô∏è SERPAPI_API_KEY not found in environment!")
else:
    print("‚úÖ API Key loaded")

‚úÖ API Key loaded


## Step 1: Load Your Existing Data

In [12]:
# Load your existing scraped restaurants
DATA_PATH = "data/google-data/google-restaurants-place/chc_google_places_v1.csv"

if os.path.exists(DATA_PATH):
    df_existing = pd.read_csv(DATA_PATH)
    print(f"Loaded {len(df_existing)} existing restaurants")
    print(f"\nCurrent columns: {list(df_existing.columns)}")
    print(f"\nSample data:")
    display(df_existing.head(3))
else:
    print(f"‚ùå File not found: {DATA_PATH}")
    print("Please update DATA_PATH to your parquet file location")

Loaded 729 existing restaurants

Current columns: ['place_id', 'data_id', 'title', 'address', 'lat', 'lon', 'type', 'rating', 'reviews_count', 'url', 'search_query', 'start_offset', 'serpapi_search_id', 'unique_key']

Sample data:


Unnamed: 0,place_id,data_id,title,address,lat,lon,type,rating,reviews_count,url,search_query,start_offset,serpapi_search_id,unique_key
0,ChIJrSTbF0CLMW0RfKhojQI8NEE,0x6d318b4017db24ad:0x41343c028d68a87c,Riverside Market,"96 Oxford Terrace, Christchurch Central City, ...",-43.533842,172.633912,Market,4.7,6777.0,,"food court in Christchurch, New Zealand",0,6915066809a51a3fe9ca6400,ChIJrSTbF0CLMW0RfKhojQI8NEE
1,ChIJ715FmhiKMW0R3l3ead1fVoc,0x6d318a189a455eef:0x87565fdd69de5dde,Little High Eatery,"255 Saint Asaph Street, Christchurch Central C...",-43.535783,172.64093,Restaurant,4.6,4772.0,,"thai restaurant in Christchurch, New Zealand",20,691506ae4707f16394f2337e,ChIJ715FmhiKMW0R3l3ead1fVoc
2,ChIJz60V9fKKMW0RLDzCMwHWodA,0x6d318af2f515adcf:0xd0a1d60133c23c2c,McDonald's Riccarton,"CNR RICCARTON RD & MATIPO ST, 155 Riccarton Ro...",-43.530287,172.595961,Fast food restaurant,3.7,3414.0,,"food court in Christchurch, New Zealand",20,6915066cb8b4a5b27f411dfb,ChIJz60V9fKKMW0RLDzCMwHWodA


## Step 2: Test API Call for a Single Restaurant

We'll use the `place_id` to get detailed information about a restaurant.

**Important:** Each API call costs credits, so we test with just 1-3 restaurants first!

In [13]:
def get_place_details(place_id: str) -> Dict:
    """
    Fetch detailed place information using Google Maps Place endpoint.
    This is more detailed than the search results.
    """
    params = {
        "engine": "google_maps",
        "type": "place",
        "place_id": place_id,
        "api_key": SERPAPI_API_KEY,
        "hl": "en"
    }
    
    r = requests.get("https://serpapi.com/search.json", params=params, timeout=60)
    
    if r.status_code != 200:
        raise RuntimeError(f"SerpAPI error {r.status_code}: {r.text[:200]}")
    
    return r.json()


# Test with ONE restaurant first
if 'df_existing' in locals() and not df_existing.empty:
    # Pick a restaurant with high reviews (likely to have complete data)
    test_place = df_existing.nlargest(1, 'reviews_count').iloc[0]
    
    print(f"üß™ Testing with: {test_place['title']}")
    print(f"   Place ID: {test_place['place_id']}")
    print(f"   Reviews: {test_place['reviews_count']}")
    print(f"\n‚è≥ Fetching detailed data...\n")
    
    test_response = get_place_details(test_place['place_id'])
    
    # Save for inspection
    with open('test_place_response.json', 'w') as f:
        json.dump(test_response, f, indent=2)
    
    print("Response saved to 'test_place_response.json'")
    print(f"\nAvailable top-level keys: {list(test_response.keys())}")

üß™ Testing with: Riverside Market
   Place ID: ChIJrSTbF0CLMW0RfKhojQI8NEE
   Reviews: 6777.0

‚è≥ Fetching detailed data...

Response saved to 'test_place_response.json'

Available top-level keys: ['search_metadata', 'search_parameters', 'place_results']


## Step 3: Inspect What Features Are Available

In [14]:
# Let's look at the structure of the response
if 'test_response' in locals():
    # Check if there's a place_results key (detailed info)
    place_data = test_response.get('place_results', {})
    
    if place_data:
        print("üéØ Available fields in place_results:")
        print("=" * 60)
        
        important_fields = [
            'title', 'rating', 'reviews', 'type', 'types',
            'address', 'phone', 'website',
            'operating_hours', 'open_state', 'hours',
            'description', 'price',
            'service_options', 'order_online',
            'gps_coordinates'
        ]
        
        for field in important_fields:
            if field in place_data:
                value = place_data[field]
                # Truncate long values for display
                if isinstance(value, str) and len(value) > 100:
                    value = value[:100] + "..."
                print(f"‚úÖ {field:20s}: {value}")
            else:
                print(f"‚ùå {field:20s}: NOT AVAILABLE")
        
        print("\n" + "=" * 60)
        print("\nüìã Full place_results keys:")
        print(list(place_data.keys()))
    else:
        print("‚ö†Ô∏è No 'place_results' found. Let's check the raw response:")
        print(json.dumps(test_response, indent=2)[:1000])

üéØ Available fields in place_results:
‚úÖ title               : Riverside Market
‚úÖ rating              : 4.7
‚úÖ reviews             : 6827
‚úÖ type                : ['Market']
‚ùå types               : NOT AVAILABLE
‚úÖ address             : 96 Oxford Terrace, Christchurch Central City, Christchurch 8011, New Zealand
‚úÖ phone               : +64 27 770 5599
‚úÖ website             : https://riverside.nz/
‚ùå operating_hours     : NOT AVAILABLE
‚úÖ open_state          : Open ¬∑ Closes 8‚ÄØPM
‚úÖ hours               : [{'tuesday': '7:30\u202fAM‚Äì8\u202fPM'}, {'wednesday': '7:30\u202fAM‚Äì8\u202fPM'}, {'thursday': '7:30\u202fAM‚Äì9\u202fPM'}, {'friday': '7:30\u202fAM‚Äì9\u202fPM'}, {'saturday': '7:30\u202fAM‚Äì9\u202fPM'}, {'sunday': '7:30\u202fAM‚Äì8\u202fPM'}, {'monday': '7:30\u202fAM‚Äì8\u202fPM'}]
‚úÖ description         : Buzzy indoor market with retailers selling locally sourced food, plus restaurants, cafes & bars.
‚ùå price               : NOT AVAILABLE
‚úÖ service_options 

## Step 4: Extract Enhanced Features Function

In [16]:
def extract_enhanced_features(place_data: Dict) -> Dict:
    """
    Extract valuable features from detailed place data.
    Returns a dict with new features to add to your existing data.
    
    UPDATED: Now handles multiple hours formats from SerpAPI!
    """
    
    # Service options (delivery, takeout, dine-in)
    service_opts = place_data.get('service_options', {})
    
    # Types (more detailed categorization)
    # Handle both 'type' (list) and 'types' (list) fields
    types_list = place_data.get('types') or place_data.get('type', [])
    if isinstance(types_list, list):
        types_str = ", ".join(types_list)
    else:
        types_str = str(types_list) if types_list else None
    
    # Operating hours - handle multiple formats
    operating_hours = {}
    
    # Format 1: Direct dict (e.g., {'monday': '9 AM-5 PM', ...})
    if 'operating_hours' in place_data and isinstance(place_data['operating_hours'], dict):
        operating_hours = place_data['operating_hours']
    
    # Format 2: List of dicts (e.g., [{'monday': '9 AM-5 PM'}, {'tuesday': '9 AM-5 PM'}, ...])
    elif 'hours' in place_data and isinstance(place_data['hours'], list):
        for day_dict in place_data['hours']:
            if isinstance(day_dict, dict):
                operating_hours.update(day_dict)
    
    # Format 3: 'hours' is already a dict
    elif 'hours' in place_data and isinstance(place_data['hours'], dict):
        operating_hours = place_data['hours']
    
    # Open state (e.g., "Open now", "Closes at 10 PM")
    open_state = place_data.get('open_state')
    
    return {
        # Contact info
        'phone': place_data.get('phone'),
        'website': place_data.get('website'),
        
        # Operating hours
        'open_state': open_state,
        'operating_hours_json': json.dumps(operating_hours) if operating_hours else None,
        
        # Monday-Sunday hours (for easier querying)
        'hours_monday': operating_hours.get('monday'),
        'hours_tuesday': operating_hours.get('tuesday'),
        'hours_wednesday': operating_hours.get('wednesday'),
        'hours_thursday': operating_hours.get('thursday'),
        'hours_friday': operating_hours.get('friday'),
        'hours_saturday': operating_hours.get('saturday'),
        'hours_sunday': operating_hours.get('sunday'),
        
        # Service options
        'has_delivery': service_opts.get('delivery') or service_opts.get('no_contact_delivery'),
        'has_takeout': service_opts.get('takeout'),
        'has_dine_in': service_opts.get('dine_in'),
        'has_onsite_services': service_opts.get('onsite_services'),
        
        # Additional info
        'description': place_data.get('description'),
        'price_level': place_data.get('price'),
        'types_detailed': types_str,
        
        # Metadata
        'details_scraped_at': datetime.now().isoformat(),
    }


# After replacing the function, re-run this cell to see the difference:
if 'test_response' in locals():
    place_data = test_response.get('place_results', {})
    if place_data:
        enhanced_features = extract_enhanced_features(place_data)
        
        print("üé® Extracted Enhanced Features:")
        print("=" * 60)
        extracted_count = 0
        for key, value in enhanced_features.items():
            if value is not None:
                extracted_count += 1
                # Truncate long values
                display_value = str(value)[:100] + "..." if isinstance(value, str) and len(str(value)) > 100 else value
                print(f"‚úÖ {key:25s}: {display_value}")
        
        print("=" * 60)
        print(f"\nüìä Total features extracted: {extracted_count}/{len(enhanced_features)}")
        print(f"   ({(extracted_count/len(enhanced_features)*100):.1f}% coverage)")

üé® Extracted Enhanced Features (UPDATED):
‚úÖ phone                    : +64 27 770 5599
‚úÖ website                  : https://riverside.nz/
‚úÖ open_state               : Open ¬∑ Closes 8‚ÄØPM
‚úÖ operating_hours_json     : {"tuesday": "7:30\u202fAM\u20138\u202fPM", "wednesday": "7:30\u202fAM\u20138\u202fPM", "thursday": "...
‚úÖ hours_monday             : 7:30‚ÄØAM‚Äì8‚ÄØPM
‚úÖ hours_tuesday            : 7:30‚ÄØAM‚Äì8‚ÄØPM
‚úÖ hours_wednesday          : 7:30‚ÄØAM‚Äì8‚ÄØPM
‚úÖ hours_thursday           : 7:30‚ÄØAM‚Äì9‚ÄØPM
‚úÖ hours_friday             : 7:30‚ÄØAM‚Äì9‚ÄØPM
‚úÖ hours_saturday           : 7:30‚ÄØAM‚Äì9‚ÄØPM
‚úÖ hours_sunday             : 7:30‚ÄØAM‚Äì8‚ÄØPM
‚úÖ has_onsite_services      : True
‚úÖ description              : Buzzy indoor market with retailers selling locally sourced food, plus restaurants, cafes & bars.
‚úÖ types_detailed           : Market
‚úÖ details_scraped_at       : 2025-12-02T15:33:46.800917

üìä Total features extracted: 15/19
   (78.9% coverage)


## Step 5: Test with Multiple Restaurants (3-5 samples)

Before running the full pipeline, let's test with a small sample to:
1. Verify data consistency
2. Calculate actual API costs
3. Identify any edge cases

In [17]:
# Test with top 5 restaurants by reviews
SAMPLE_SIZE = 5

if 'df_existing' in locals() and not df_existing.empty:
    sample_restaurants = df_existing.nlargest(SAMPLE_SIZE, 'reviews_count')
    
    print(f"üß™ Testing with {len(sample_restaurants)} restaurants:")
    print("=" * 60)
    
    enhanced_data = []
    errors = []
    
    for idx, row in sample_restaurants.iterrows():
        try:
            print(f"\nüìç [{idx+1}/{len(sample_restaurants)}] {row['title']}")
            
            # Fetch details
            response = get_place_details(row['place_id'])
            place_data = response.get('place_results', {})
            
            if place_data:
                features = extract_enhanced_features(place_data)
                
                # Combine with existing data
                combined = {**row.to_dict(), **features}
                enhanced_data.append(combined)
                
                # Print key features
                print(f"   ‚úÖ Phone: {features.get('phone', 'N/A')}")
                print(f"   ‚úÖ Website: {'Yes' if features.get('website') else 'No'}")
                print(f"   ‚úÖ Hours: {'Yes' if features.get('hours_monday') else 'No'}")
                print(f"   ‚úÖ Delivery: {features.get('has_delivery', False)}")
            else:
                print(f"   ‚ö†Ô∏è No place_results found")
                errors.append({'place_id': row['place_id'], 'title': row['title'], 'error': 'No place_results'})
            
            # Rate limiting
            time.sleep(2)
            
        except Exception as e:
            print(f"   ‚ùå Error: {str(e)}")
            errors.append({'place_id': row['place_id'], 'title': row['title'], 'error': str(e)})
    
    # Create DataFrame
    df_enhanced = pd.DataFrame(enhanced_data)
    
    print("\n" + "=" * 60)
    print(f"‚úÖ Successfully enhanced: {len(df_enhanced)} restaurants")
    print(f"‚ùå Errors: {len(errors)}")
    
    if errors:
        print("\n‚ö†Ô∏è Errors encountered:")
        for err in errors:
            print(f"   - {err['title']}: {err['error']}")

üß™ Testing with 5 restaurants:

üìç [1/5] Riverside Market
   ‚úÖ Phone: +64 27 770 5599
   ‚úÖ Website: Yes
   ‚úÖ Hours: Yes
   ‚úÖ Delivery: None

üìç [2/5] Little High Eatery
   ‚úÖ Phone: +64 210 208 4444
   ‚úÖ Website: Yes
   ‚úÖ Hours: Yes
   ‚úÖ Delivery: True

üìç [3/5] McDonald's Riccarton
   ‚úÖ Phone: +64 3 343 0549
   ‚úÖ Website: Yes
   ‚úÖ Hours: Yes
   ‚úÖ Delivery: True

üìç [4/5] C1 Espresso
   ‚úÖ Phone: None
   ‚úÖ Website: Yes
   ‚úÖ Hours: Yes
   ‚úÖ Delivery: None

üìç [5/5] The Rockpool Bar
   ‚úÖ Phone: +64 3 374 9461
   ‚úÖ Website: Yes
   ‚úÖ Hours: Yes
   ‚úÖ Delivery: True

‚úÖ Successfully enhanced: 5 restaurants
‚ùå Errors: 0


## Step 6: Analyze Feature Coverage

In [18]:
# Analyze which features are most commonly available
if 'df_enhanced' in locals() and not df_enhanced.empty:
    print("üìä Feature Coverage Analysis:")
    print("=" * 60)
    
    new_features = [
        'phone', 'website', 'open_state', 'hours_monday',
        'has_delivery', 'has_takeout', 'has_dine_in',
        'description', 'price_level', 'types_detailed'
    ]
    
    for feature in new_features:
        if feature in df_enhanced.columns:
            # Count non-null values
            coverage = df_enhanced[feature].notna().sum()
            percentage = (coverage / len(df_enhanced)) * 100
            
            status = "‚úÖ" if percentage >= 80 else "‚ö†Ô∏è" if percentage >= 50 else "‚ùå"
            print(f"{status} {feature:20s}: {coverage}/{len(df_enhanced)} ({percentage:.1f}%)")
    
    print("\n" + "=" * 60)
    print("\nüìã Sample of enhanced data:")
    display(df_enhanced[['title', 'phone', 'website', 'open_state', 'has_delivery', 'has_takeout']].head())

üìä Feature Coverage Analysis:
‚úÖ phone               : 4/5 (80.0%)
‚úÖ website             : 5/5 (100.0%)
‚úÖ open_state          : 5/5 (100.0%)
‚úÖ hours_monday        : 5/5 (100.0%)
‚ö†Ô∏è has_delivery        : 3/5 (60.0%)
‚ö†Ô∏è has_takeout         : 3/5 (60.0%)
‚úÖ has_dine_in         : 4/5 (80.0%)
‚úÖ description         : 4/5 (80.0%)
‚úÖ price_level         : 4/5 (80.0%)
‚úÖ types_detailed      : 5/5 (100.0%)


üìã Sample of enhanced data:


Unnamed: 0,title,phone,website,open_state,has_delivery,has_takeout
0,Riverside Market,+64 27 770 5599,https://riverside.nz/,Open ¬∑ Closes 8‚ÄØPM,,
1,Little High Eatery,+64 210 208 4444,http://littlehigh.co.nz/,Open ¬∑ Closes 10‚ÄØPM,True,True
2,McDonald's Riccarton,+64 3 343 0549,https://mcdonalds.co.nz/find-us/restaurants?ut...,Open 24 hours,True,
3,C1 Espresso,,https://coffee.c1espresso.co.nz/,Open ¬∑ Closes 9‚ÄØPM,,True
4,The Rockpool Bar,+64 3 374 9461,http://www.therockpool.co.nz/,Open ¬∑ Closes 3‚ÄØAM,True,True


## Step 7: Operating Hours Parser (for "open now" queries)

Let's create helper functions to parse operating hours and check if a place is open.

In [19]:
def parse_hours_string(hours_str: str) -> Optional[tuple]:
    """
    Parse hours string like "11 AM‚Äì11 PM" into (open_time, close_time).
    Returns None if closed or cannot parse.
    """
    if not hours_str or pd.isna(hours_str):
        return None
    
    hours_str = hours_str.strip().lower()
    
    # Check if closed
    if 'closed' in hours_str:
        return None
    
    # Check for 24 hours
    if '24 hours' in hours_str or 'open 24 hours' in hours_str:
        return (dt_time(0, 0), dt_time(23, 59))
    
    # Try to parse format like "11 AM‚Äì11 PM" or "11:30 AM‚Äì10:00 PM"
    try:
        # Split by dash or en-dash
        parts = hours_str.replace('‚Äì', '-').replace('‚Äî', '-').split('-')
        if len(parts) != 2:
            return None
        
        open_str, close_str = [p.strip() for p in parts]
        
        # Parse times (this is simplified - you may need more robust parsing)
        def parse_time(time_str):
            time_str = time_str.replace(' ', '').upper()
            
            # Handle formats like "11AM" or "11:30PM"
            if 'AM' in time_str or 'PM' in time_str:
                is_pm = 'PM' in time_str
                time_str = time_str.replace('AM', '').replace('PM', '')
                
                if ':' in time_str:
                    hour, minute = time_str.split(':')
                else:
                    hour = time_str
                    minute = '0'
                
                hour = int(hour)
                minute = int(minute)
                
                if is_pm and hour != 12:
                    hour += 12
                elif not is_pm and hour == 12:
                    hour = 0
                
                return dt_time(hour, minute)
            
            return None
        
        open_time = parse_time(open_str)
        close_time = parse_time(close_str)
        
        if open_time and close_time:
            return (open_time, close_time)
        
    except Exception as e:
        pass
    
    return None


def is_open_now(hours_dict: Dict, current_datetime: datetime = None) -> Optional[bool]:
    """
    Check if a restaurant is currently open based on its operating hours.
    
    Args:
        hours_dict: Dict with keys like 'hours_monday', 'hours_tuesday', etc.
        current_datetime: datetime to check (defaults to now)
    
    Returns:
        True if open, False if closed, None if cannot determine
    """
    if current_datetime is None:
        current_datetime = datetime.now()
    
    # Get day name
    day_name = current_datetime.strftime('%A').lower()
    hours_key = f'hours_{day_name}'
    
    if hours_key not in hours_dict:
        return None
    
    hours_str = hours_dict[hours_key]
    parsed = parse_hours_string(hours_str)
    
    if parsed is None:
        return False  # Closed
    
    open_time, close_time = parsed
    current_time = current_datetime.time()
    
    # Handle cases where closing time is past midnight
    if close_time < open_time:
        # e.g., 11 PM to 2 AM
        return current_time >= open_time or current_time <= close_time
    else:
        return open_time <= current_time <= close_time


# Test the functions
if 'df_enhanced' in locals() and not df_enhanced.empty:
    print("Testing 'is open now' functionality:")
    print("=" * 60)
    
    for idx, row in df_enhanced.head(3).iterrows():
        print(f"\nüìç {row['title']}")
        
        # Parse Monday hours as example
        if 'hours_monday' in row and pd.notna(row['hours_monday']):
            print(f"   Monday hours: {row['hours_monday']}")
            parsed = parse_hours_string(row['hours_monday'])
            if parsed:
                print(f"   Parsed: {parsed[0]} to {parsed[1]}")
        
        # Check if open now
        is_open = is_open_now(row)
        print(f"   Open now: {is_open}")

Testing 'is open now' functionality:

üìç Riverside Market
   Monday hours: 7:30‚ÄØAM‚Äì8‚ÄØPM
   Parsed: 07:30:00 to 20:00:00
   Open now: True

üìç Little High Eatery
   Monday hours: 11‚ÄØAM‚Äì10‚ÄØPM
   Parsed: 11:00:00 to 22:00:00
   Open now: True

üìç McDonald's Riccarton
   Monday hours: Open 24 hours
   Parsed: 00:00:00 to 23:59:00
   Open now: True


In [21]:
import pandas as pd

df = pd.read_csv(DATA_PATH)

print(f"üìä Your Dataset:")
print(f"   Total places: {len(df)}")
print(f"   Estimated cost: ${len(df) * 0.01:.2f}")

# Recommended: Filter by reviews
for min_reviews in [10, 20, 50]:
    filtered = df[df['reviews_count'] >= min_reviews]
    print(f"   >={min_reviews} reviews: {len(filtered)} places (${len(filtered) * 0.01:.2f})")

üìä Your Dataset:
   Total places: 729
   Estimated cost: $7.29
   >=10 reviews: 716 places ($7.16)
   >=20 reviews: 708 places ($7.08)
   >=50 reviews: 677 places ($6.77)


## Step 9: Save Sample Enhanced Data