In [None]:
import os, time, pathlib, pprint, requests, json
from datetime import datetime, timedelta

# Base URL of the browser/scraper service. Trailing slashes are stripped so
# that f"{EP}/jobs/..." never produces a double slash when the environment
# variable is set to something like "http://host:8004/".
EP = os.getenv("BROWSER_ENDPOINT", "http://localhost:8004").rstrip("/")

# Directory referenced for scraped output. Overridable through
# SCRAPED_DATA_DIR so the notebook is not tied to one machine's layout; the
# default is unchanged.
SCRAPED = pathlib.Path(os.getenv("SCRAPED_DATA_DIR", "/storage/scraped_data"))

def wait_for(job_id, every=3, max_wait=None, request_timeout=30):
    """Poll the job endpoint until the job reaches a terminal state.

    Args:
        job_id: Identifier returned by the service when the job was submitted.
        every: Seconds to sleep between polls.
        max_wait: Optional overall limit in seconds. ``None`` (the default)
            keeps polling indefinitely, as before.
        request_timeout: Per-request timeout in seconds passed to
            ``requests.get`` so a stalled connection cannot hang the cell.

    Returns:
        The job record (decoded JSON) once its ``status`` is ``"finished"``
        or ``"error"``.

    Raises:
        requests.HTTPError: If a poll returns an HTTP error status.
        TimeoutError: If ``max_wait`` elapses before the job terminates.
    """
    print(f"⏳ Waiting for job {job_id}...")
    deadline = None if max_wait is None else time.monotonic() + max_wait
    while True:
        resp = requests.get(f"{EP}/jobs/{job_id}", timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of tripping over an error body below.
        resp.raise_for_status()
        rec = resp.json()
        status = rec["status"]
        if status in {"finished", "error"}:
            # Do not decorate a failed job with a success icon.
            icon = "✅" if status == "finished" else "❌"
            print(f"\n{icon} {status.upper()}")
            return rec
        # Fall back to the bare status if the elapsed-time label is absent.
        print(f"\r⏱️  {rec.get('status_with_elapsed', status)}", end="")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Job {job_id} still '{status}' after {max_wait}s"
            )
        time.sleep(every)

def submit(task, payload):
    """Create a job for ``task`` and block until it completes.

    The payload is echoed as indented JSON, POSTed to ``{EP}/jobs/{task}``,
    and the resulting job id is handed to ``wait_for``.

    Args:
        task: Task name used as the last path segment of the jobs URL.
        payload: JSON-serializable dict sent as the request body.

    Returns:
        Whatever ``wait_for`` returns for the newly created job.

    Raises:
        requests.HTTPError: If the POST returns an HTTP error status.
    """
    print(f"🚀 Submitting {task} task...")
    pretty_payload = json.dumps(payload, indent=2)
    print(f"📝 Payload: {pretty_payload}")

    response = requests.post(f"{EP}/jobs/{task}", json=payload)
    response.raise_for_status()

    job_id = response.json()["job_id"]
    print(f"🆔 Job ID: {job_id}")
    return wait_for(job_id)

def analyze_results(result, test_name):
    """Print a human-readable summary of a finished booking-hotels job.

    Args:
        result: Job record dict as returned by ``wait_for``. Reads the
            ``status``, ``error`` and ``result`` keys; inside ``result`` it
            reads ``search_metadata`` and ``hotels``.
        test_name: Label printed in the report header.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n" + "="*60)
    print(f"📊 ANALYSIS: {test_name}")
    print("="*60)

    if result.get("status") == "error":
        print(f"❌ FAILED: {result.get('error', 'Unknown error')}")
        return

    res = result.get("result")
    if not isinstance(res, dict):
        # Covers both a missing "result" key and a null/odd-typed payload.
        print(f"❌ Task failed or returned unexpected result")
        pprint.pp(result)
        return

    metadata = res.get("search_metadata") or {}
    hotels = res.get("hotels") or []

    print(f"✅ SUCCESS: Found {len(hotels)} hotels")
    print(f"📍 Location: {metadata.get('location')}")
    print(f"📅 Check-in: {metadata.get('check_in')}")
    print(f"📅 Check-out: {metadata.get('check_out')}")
    print(f"🏨 Nights: {metadata.get('nights')}")
    print(f"🔧 Method: {metadata.get('extraction_method')}")

    # Average only over hotels that actually carry a value. Building the
    # lists first avoids dividing by zero when no hotel has a price/rating
    # and avoids summing None when a key is present but null.
    prices = [h["price_per_night"] for h in hotels if h.get("price_per_night")]
    ratings = [h["rating"] for h in hotels if h.get("rating")]

    if prices:
        avg_price = sum(prices) / len(prices)
        if avg_price > 0:
            print(f"💰 Avg Price: ${avg_price:.0f}/night")
    if ratings:
        avg_rating = sum(ratings) / len(ratings)
        if avg_rating > 0:
            print(f"⭐ Avg Rating: {avg_rating:.1f}/10")

    print(f"📊 Deep Scraping: {'Yes' if metadata.get('deep_scrape_enabled') else 'No'}")
    print(f"⏱️  Completed at: {metadata.get('search_completed_at')}")

# Build the check-in / check-out dates used by the test payloads below.
# Every date is an offset from "now", formatted as YYYY-MM-DD.
today = datetime.now()


def _days_from(base, offset_days):
    """Return the YYYY-MM-DD string for `offset_days` days after `base`."""
    return (base + timedelta(days=offset_days)).strftime("%Y-%m-%d")


check_in_1, check_out_1 = _days_from(today, 30), _days_from(today, 33)
check_in_2, check_out_2 = _days_from(today, 45), _days_from(today, 47)
check_in_3, check_out_3 = _days_from(today, 60), _days_from(today, 64)

print("📅 Test dates calculated:")
for _num, (_start, _end) in enumerate(
    [
        (check_in_1, check_out_1),
        (check_in_2, check_out_2),
        (check_in_3, check_out_3),
    ],
    start=1,
):
    print(f"   Test {_num}: {_start} to {_end}")

In [3]:
# Test Case 1: Basic search (Riyadh hotels)
# Smallest payload in the notebook: location, stay dates, guest count and a
# result cap, with no price or rating filters.
basic_search = dict(
    location="Riyadh, Saudi Arabia",
    check_in=check_in_1,
    check_out=check_out_1,
    adults=2,
    max_results=10,
)

# Runs the job to completion, then prints the summary report.
result_1 = submit("booking-hotels", basic_search)
analyze_results(result_1, "Basic Hotel Search - Riyadh")

🚀 Submitting booking-hotels task...
📝 Payload: {
  "location": "Riyadh, Saudi Arabia",
  "check_in": "2025-09-19",
  "check_out": "2025-09-22",
  "adults": 2,
  "max_results": 10
}
🆔 Job ID: 675a78e03ee047bca893b61b5287d0a2
⏳ Waiting for job 675a78e03ee047bca893b61b5287d0a2...
⏱️  running 1m 48s
✅ FINISHED

📊 ANALYSIS: Basic Hotel Search - Riyadh
✅ SUCCESS: Found 10 hotels
📍 Location: Riyadh, Saudi Arabia
📅 Dates: 2025-09-19 to 2025-09-22
🏨 Nights: 3
💰 Avg Price: $939.0/night
⭐ Avg Rating: 9.3/10
⏱️  Execution Time: 109.1s
📁 Data File: hotels_data.json


In [None]:
# Test Case 2: Budget hotels with filters
# Same shape as the basic search, plus a price ceiling and a rating floor.
budget_search = dict(
    location="Dubai, UAE",
    check_in=check_in_2,
    check_out=check_out_2,
    adults=1,
    max_price=300,
    min_rating=7.0,
    max_results=15,
)

# Runs the job to completion, then prints the summary report.
result_2 = submit("booking-hotels", budget_search)
analyze_results(result_2, "Budget Hotels - Dubai")

In [4]:
# # Test Case 3: Luxury family trip
# luxury_search = {
#     "location": "Jeddah, Saudi Arabia",
#     "check_in": check_in_3,
#     "check_out": check_out_3,
#     "adults": 2,
#     "children": 2,
#     "rooms": 2,
#     "min_price": 500,
#     "star_rating": [4, 5],
#     "amenities": ["pool", "wifi", "gym"],
#     "max_results": 20
# }

# result_3 = submit("booking-hotels", luxury_search)
# analyze_results(result_3, "Luxury Family Trip - Jeddah")

In [5]:
# # Test Case 4: Business trip (no reviews needed)
# business_search = {
#     "location": "KAUST, Thuwal, Saudi Arabia",
#     "check_in": check_in_1,
#     "check_out": check_out_1,
#     "adults": 1,
#     "amenities": ["wifi"],
#     "include_reviews": False,
#     "max_results": 5
# }

# result_4 = submit("booking-hotels", business_search)
# analyze_results(result_4, "Business Trip - KAUST Area")

In [6]:
# # Test Case 5: Conference attendee scenario (proximity search)
# conference_search = {
#     "location": "Riyadh International Convention Center, Riyadh",
#     "search_radius": "5km",
#     "check_in": check_in_2,
#     "check_out": check_out_2,
#     "adults": 1,
#     "max_results": 15
# }

# result_5 = submit("booking-hotels", conference_search)
# analyze_results(result_5, "Conference Proximity Search")

In [7]:
# # Test Summary (disabled: needs result_1 through result_5, so re-enable Test Cases 3-5 above before uncommenting)
# print("📋 BOOKING HOTELS TASK - TEST SUMMARY")
# print("="*60)

# test_results = [
#     ("Basic Search - Riyadh", result_1),
#     ("Budget Hotels - Dubai", result_2),
#     ("Luxury Family - Jeddah", result_3),
#     ("Business Trip - KAUST", result_4),
#     ("Conference Proximity", result_5)
# ]

# successful_tests = 0
# failed_tests = 0
# total_hotels = 0
# total_execution_time = 0

# for test_name, result in test_results:
#     if result["status"] == "finished" and result.get("result", {}).get("success"):
#         successful_tests += 1
#         hotels_found = result["result"].get("hotels_found", 0)
#         exec_time = result["result"].get("execution_time_seconds", 0)
#         total_hotels += hotels_found
#         total_execution_time += exec_time
#         print(f"✅ {test_name}: {hotels_found} hotels ({exec_time}s)")
#     else:
#         failed_tests += 1
#         error_msg = result.get("error", "Unknown error")
#         print(f"❌ {test_name}: FAILED - {error_msg}")

# print("\n📊 OVERALL STATISTICS:")
# print(f"   ✅ Successful tests: {successful_tests}/{len(test_results)}")
# print(f"   ❌ Failed tests: {failed_tests}/{len(test_results)}")
# print(f"   🏨 Total hotels collected: {total_hotels}")
# print(f"   ⏱️  Total execution time: {total_execution_time:.1f}s")
# print(f"   📁 Data files saved to: {SCRAPED}/booking-hotels/")

# if successful_tests == len(test_results):
#     print("\n🎉 ALL TESTS PASSED! The booking-hotels task is working perfectly.")
# elif successful_tests >= len(test_results) * 0.75:
#     print("\n✅ Most tests passed. The booking-hotels task is working well with minor issues.")
# else:
#     print("\n⚠️  Multiple tests failed. The booking-hotels task needs debugging.")

# print("\n🔍 To examine detailed results, check the JSON files in the data directories.")

In [None]:
# Test NEW DOM Scraping Implementation v6.0
# Submits a small Dubai search, then checks three things about the returned
# hotels: which source each came from, whether its address looks like
# Dubai/UAE rather than Saudi Arabia, and how many carry a price.
print("🧪 Testing NEW DOM-based scraper v6.0 with Dubai search...")
print("🎯 Expected: Dubai hotels with UAE addresses (not Saudi Arabia weekend deals)")

dubai_test_new = {
    "location": "Dubai, UAE",
    "check_in": check_in_2,
    "check_out": check_out_2,
    "adults": 2,
    "max_results": 5
}

result_dubai_new = submit("booking-hotels", dubai_test_new)

# Lowercase substrings used to classify a hotel address by country.
UAE_KEYWORDS = ['dubai', 'uae', 'united arab emirates', 'emirate']
SAUDI_KEYWORDS = ['saudi', 'riyadh', 'jeddah', 'mecca', 'dammam', 'khobar']

# Enhanced analysis for new implementation
if result_dubai_new["status"] == "finished":
    # Tolerate a finished job whose payload is missing or null.
    res = result_dubai_new.get("result") or {}
    hotels = res.get("hotels") or []
    metadata = res.get("search_metadata") or {}

    print(f"\n🔍 NEW IMPLEMENTATION ANALYSIS:")
    print(f"📊 Total hotels found: {len(hotels)}")
    print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

    dubai_count = 0
    saudi_count = 0
    search_result_count = 0
    weekend_deal_count = 0
    hotels_with_prices = 0
    total_price = 0

    for i, hotel in enumerate(hotels, 1):
        name = hotel.get('name', 'Unknown')
        # The address may be present but None; keep the original casing for
        # display and use a lowercased copy only for keyword matching.
        raw_address = str(hotel.get('address') or '')
        address = raw_address.lower()
        source = hotel.get('source', 'unknown')
        price = hotel.get('price_per_night')
        rating = hotel.get('rating', 'N/A')
        completeness = hotel.get('data_completeness') or 0

        print(f"\n🏨 Hotel #{i}: {name}")
        print(f"   📍 Address: {raw_address or 'N/A'}")
        print(f"   💰 Price: ${price}/night" if price else "   💰 Price: N/A")
        print(f"   ⭐ Rating: {rating}/10")
        print(f"   🔧 Source: {source}")
        print(f"   📊 Completeness: {completeness}%")

        # Count source types
        if source == 'search_results':
            search_result_count += 1
        elif source == 'weekend_deals':
            weekend_deal_count += 1

        # Count prices
        if price:
            hotels_with_prices += 1
            total_price += price

        # Analyze location
        if address:
            if any(keyword in address for keyword in UAE_KEYWORDS):
                dubai_count += 1
                print(f"   ✅ DUBAI/UAE HOTEL CONFIRMED")
            elif any(keyword in address for keyword in SAUDI_KEYWORDS):
                saudi_count += 1
                print(f"   🇸🇦 SAUDI ARABIA HOTEL (LOCATION MISMATCH!)")
            else:
                print(f"   ❓ Location unclear from address")
        else:
            print(f"   ❓ No address data")

    # Calculate averages (guarded so empty inputs yield 0 instead of raising)
    avg_price = total_price / hotels_with_prices if hotels_with_prices > 0 else 0
    avg_completeness = (
        sum((h.get('data_completeness') or 0) for h in hotels) / len(hotels)
        if hotels else 0
    )

    print(f"\n📊 NEW IMPLEMENTATION METRICS:")
    print(f"   🎯 Search Results: {search_result_count} hotels")
    print(f"   🎪 Weekend Deals: {weekend_deal_count} hotels")
    print(f"   💰 Hotels with prices: {hotels_with_prices}/{len(hotels)}")
    print(f"   💵 Average price: ${avg_price:.0f}/night" if avg_price > 0 else "   💵 Average price: N/A")
    print(f"   📊 Average completeness: {avg_completeness:.1f}%")

    print(f"\n🌍 LOCATION ACCURACY:")
    print(f"   🇦🇪 Dubai/UAE hotels: {dubai_count}")
    print(f"   🇸🇦 Saudi Arabia hotels: {saudi_count}")
    print(f"   ❓ Unclear location: {len(hotels) - dubai_count - saudi_count}")

    # SUCCESS ANALYSIS
    print(f"\n🎯 SUCCESS ANALYSIS:")

    if search_result_count > weekend_deal_count:
        print(f"✅ SOURCE SUCCESS: More search results ({search_result_count}) than weekend deals ({weekend_deal_count})")
    elif weekend_deal_count > search_result_count:
        print(f"❌ SOURCE ISSUE: More weekend deals ({weekend_deal_count}) than search results ({search_result_count})")
    else:
        # A tie (including 0 vs 0) is not "more weekend deals"; say so.
        print(f"⚠️  SOURCE UNCLEAR: Search results and weekend deals are tied at {search_result_count}")

    if dubai_count > saudi_count:
        print(f"✅ LOCATION SUCCESS: More Dubai hotels ({dubai_count}) than Saudi hotels ({saudi_count})")
        if dubai_count >= len(hotels) * 0.8:
            print(f"🎉 EXCELLENT: {dubai_count}/{len(hotels)} hotels are location-accurate (≥80%)")
        else:
            print(f"✅ GOOD: {dubai_count}/{len(hotels)} hotels are location-accurate")
    elif saudi_count > 0:
        print(f"❌ LOCATION ISSUE: Found {saudi_count} Saudi hotels when searching for Dubai")
    else:
        print(f"❓ LOCATION UNCLEAR: Need more address data to verify accuracy")

    if metadata.get('extraction_method') == 'quick_extraction':
        print(f"✅ METHOD SUCCESS: Using quick DOM extraction (not GraphQL)")

    # OVERALL VERDICT
    # Full success needs both >=80% UAE addresses and a majority of real
    # search results; with zero hotels the second condition fails, so an
    # empty response falls through to "NEEDS WORK".
    if dubai_count >= len(hotels) * 0.8 and search_result_count > weekend_deal_count:
        print(f"\n🎉 INTEGRATION SUCCESS: New DOM scraping is working correctly!")
        print(f"   • Location-specific results: {dubai_count}/{len(hotels)} hotels")
        print(f"   • Actual search results: {search_result_count} vs {weekend_deal_count} weekend deals")
        print(f"   • Price extraction: {hotels_with_prices}/{len(hotels)} hotels")
    elif dubai_count > saudi_count:
        print(f"\n✅ PARTIAL SUCCESS: Significant improvement over old GraphQL approach")
        print(f"   • Getting Dubai hotels instead of Saudi weekend deals")
        print(f"   • Room for improvement in data completeness")
    else:
        print(f"\n⚠️  NEEDS WORK: Still issues with location accuracy or data extraction")

else:
    print(f"❌ Test FAILED: {result_dubai_new.get('error', 'Unknown error')}")

print("\n" + "="*80)