In [25]:
import pandas as pd
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from helpers import RateLimiter, detect_endpoint_for_url
# File locations: the tenant registry CSV read as input, and the JSON / CSV
# files that receive the detection results.
TARGET_REGISTRY_PATH = './data/avature_tenants.csv'
OUTPUT_JSON = './data/site_configs_raw.json'
OUTPUT_CSV = './data/site_configs.csv'

In [26]:
# Configure root logging once (INFO and above, timestamped records) and
# create the logger used by the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [27]:
# Concurrency configuration
MAX_WORKERS = 10  # Passed as max_workers to the ThreadPoolExecutor
REQUESTS_PER_SECOND = 20  # Global rate limit

In [28]:
# Shared progress state: worker tasks update the tallies while holding the lock.
progress_lock = Lock()
progress_counter = dict.fromkeys(('processed', 'success', 'partial', 'failed'), 0)

In [29]:
# Initialize the single RateLimiter instance that is handed to every worker
# task. It is constructed from REQUESTS_PER_SECOND.
# NOTE(review): presumably this caps the aggregate request rate across all
# threads — confirm against helpers.RateLimiter.
rate_limiter = RateLimiter(REQUESTS_PER_SECOND)

In [30]:
def process_row_with_progress(args, rate_limiter):
    """Detect the endpoint for one registry row and record the outcome.

    Intended to be submitted to a thread pool. The shared ``progress_counter``
    is only modified while ``progress_lock`` is held.

    Args:
        args: ``(idx, row, total)`` tuple. ``row`` must provide the keys
            ``'tenant'``, ``'full_url'`` and ``'is_career_page'``; ``total``
            is the number of rows in the batch (used only in log messages).
            ``idx`` is not used here.
        rate_limiter: Forwarded unchanged to ``detect_endpoint_for_url``.

    Returns:
        The result returned by ``detect_endpoint_for_url``. Its ``'status'``
        key is read for every result; ``'tenant'`` and ``'type'`` are read
        only when the status is ``'success'`` or ``'partial'``.
    """
    _, row, total = args

    result = detect_endpoint_for_url(
        tenant=row['tenant'],
        url=row['full_url'],
        is_career_page=row['is_career_page'],
        rate_limiter=rate_limiter,
    )

    # All bookkeeping and progress logging happens under the lock so that the
    # counter values and the "[n/total]" prefix stay consistent.
    with progress_lock:
        progress_counter['processed'] += 1
        done = progress_counter['processed']

        status = result['status']
        if status in ('success', 'partial'):
            progress_counter[status] += 1
            logger.info(f"[{done}/{total}] "
                        f"{result['tenant']}: {result['type']}")
        else:
            # Any other status is tallied as a failure and not logged per row.
            progress_counter['failed'] += 1

        # Periodic roll-up every 50 processed rows.
        if done % 50 == 0:
            ok = progress_counter['success']
            logger.info(f"Progress: {done}/{total} | Success: {ok} ({ok/done*100:.1f}%)")

    return result

In [31]:
# Load tenant registry
logger.info("Loading tenant registry...")
df_tenants = pd.read_csv(TARGET_REGISTRY_PATH)

# Prioritize career pages first. Both halves of the split are defined here:
# df_other was previously logged without ever being assigned, which raised
# NameError on a fresh kernel.
career_mask = df_tenants['is_career_page'] == True  # noqa: E712 (element-wise comparison)
df_career = df_tenants[career_mask].copy()
df_other = df_tenants[~career_mask].copy()
total = len(df_career)

logger.info(f"Total URLs: {len(df_tenants)}")
logger.info(f"Career pages: {len(df_career)}")
logger.info(f"Other pages: {len(df_other)}")

# Start with career pages (highest yield)
logger.info("\n" + "="*60)
logger.info("PHASE 1: Detecting endpoints for career pages")
logger.info("="*60 + "\n")

start_time = time.time()
# Prepare arguments for parallel processing: one (idx, row, total) tuple per row
tasks = [(idx, row, total) for idx, row in df_career.iterrows()]

# Execute in parallel; results are collected in completion order, not row order
configs = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_row_with_progress, task, rate_limiter) for task in tasks]

    for future in as_completed(futures):
        try:
            configs.append(future.result())
        except Exception as e:
            # Best effort: one failing task must not abort the whole batch.
            logger.error(f"Thread exception: {e}")

elapsed = time.time() - start_time
logger.info(f"Phase 1 finished in {elapsed:.1f}s ({len(configs)}/{total} results collected)")

# Convert to DataFrame
df_configs = pd.DataFrame(configs)
# Save intermediate results to the paths declared in the configuration cell
df_configs.to_json(OUTPUT_JSON, orient='records', indent=2)
df_configs.to_csv(OUTPUT_CSV, index=False)

2026-01-31 21:15:50,645 - INFO - Loading tenant registry...
2026-01-31 21:15:50,657 - INFO - Total URLs: 2398
2026-01-31 21:15:50,659 - INFO - Career pages: 707
2026-01-31 21:15:50,659 - INFO - Other pages: 1691
2026-01-31 21:15:50,661 - INFO - 
2026-01-31 21:15:50,662 - INFO - PHASE 1: Detecting endpoints for career pages

2026-01-31 21:15:52,672 - INFO - [16/707] ally.avature.net: search_jobs_html
2026-01-31 21:15:52,722 - INFO - [17/707] amerilife.avature.net: search_jobs_html
2026-01-31 21:15:52,831 - INFO - [18/707] a2milkkf.avature.net: search_jobs_html
2026-01-31 21:15:53,185 - INFO - [24/707] advocateaurorahealth.avature.net: search_jobs_html
2026-01-31 21:15:53,774 - INFO - [26/707] amswh.avature.net: search_jobs_html
2026-01-31 21:15:54,941 - INFO - [29/707] astellas.avature.net: search_jobs_html
2026-01-31 21:15:55,470 - INFO - [31/707] astellasjapan.avature.net: search_jobs_html
2026-01-31 21:15:55,896 - INFO - [32/707] arcbest.avature.net: html_scrape
2026-01-31 21:15:56,3

In [32]:
# Print summary
logger.info("\n" + "="*60)
logger.info("DETECTION SUMMARY")
logger.info("="*60)

total = len(df_configs)

# An empty result set yields a DataFrame without a 'status' column, so guard
# the lookup instead of raising KeyError.
if 'status' in df_configs.columns:
    status_counts = df_configs['status'].value_counts()
    success = int(status_counts.get('success', 0))
    partial = int(status_counts.get('partial', 0))
else:
    success = partial = 0

# Everything that is neither success nor partial counts as failed, matching
# the progress tracking in process_row_with_progress.
failed = total - success - partial

# Avoid ZeroDivisionError when nothing was processed (all shares print 0.0%).
denominator = total or 1

logger.info(f"\nTotal URLs processed: {total}")
logger.info(f"Success: {success} ({success/denominator*100:.1f}%)")
logger.info(f"Partial: {partial} ({partial/denominator*100:.1f}%)")
logger.info(f"Failed: {failed} ({failed/denominator*100:.1f}%)")

2026-01-31 21:17:41,002 - INFO - 
2026-01-31 21:17:41,003 - INFO - DETECTION SUMMARY
2026-01-31 21:17:41,008 - INFO - 
Total URLs processed: 707
2026-01-31 21:17:41,009 - INFO - Success: 178 (25.2%)
2026-01-31 21:17:41,009 - INFO - Partial: 9 (1.3%)
2026-01-31 21:17:41,010 - INFO - Failed: 520 (73.6%)


In [33]:
# Break down the successful detections by endpoint type (skipped when there
# were no successes).
if success > 0:
    logger.info("\nEndpoint Types Detected:")
    successful_rows = df_configs.loc[df_configs['status'] == 'success']
    for endpoint_type, count in successful_rows['type'].value_counts().items():
        logger.info(f"{endpoint_type}: {count}")

2026-01-31 21:17:41,021 - INFO - 
Endpoint Types Detected:
2026-01-31 21:17:41,025 - INFO - search_jobs_html: 178
