In [1]:
# Standard library
import os
import glob
import re
import time
import asyncio

# Third-party HTTP / async
import requests
import aiohttp
import nest_asyncio

# Data & analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Web scraping
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup

# Fuzzy matching
from fuzzywuzzy import fuzz, process

# Apify
from apify_client import ApifyClient


In [None]:
# STB TRAVEL AGENT SCRAPER - COST OPTIMIZED
from apify_client import ApifyClient
import pandas as pd
from datetime import datetime

client = ApifyClient("")

# ============================================================
# CONFIGURATION (COST OPTIMIZED)
# ============================================================
MAX_CONCURRENCY = 3          # Reduced for stability and lower memory
MAX_RETRIES = 2              # Retries per failed page
BATCH_SIZE = 50              # Increased - fewer actor runs = less startup overhead
TOTAL_PAGES = 100             # Total pages to scrape (adjust based on site)
START_PAGE = 0               # Starting page index

def create_stb_pagefunction_optimized() -> str:
    """OPTIMIZED: Removed fixed delays, uses fallback selectors"""
    return """
async function pageFunction(context) {
    const { page, log, request } = context;
    const pageIndex = request?.userData?.pageIndex ?? 0;
    
    try {
        // Wait for cards with fallback selectors (no fixed delays)
        try {
            await page.waitForSelector('.box-list.grid', { timeout: 20000 });
        } catch (e) {
            // Fallback: try waiting for individual card elements
            await page.waitForSelector('.name', { timeout: 10000 });
        }
        
        // Extract all travel agents
        const agents = await page.evaluate(() => {
            const results = [];
            
            // Select all company cards
            const cards = document.querySelectorAll('.box-list.grid');
            
            cards.forEach((card) => {
                try {
                    // Company Name - clean text only
                    const nameEl = card.querySelector('.name');
                    const companyName = nameEl ? nameEl.textContent.trim() : null;
                    
                    // Address
                    const addressEl = card.querySelector('.address');
                    const address = addressEl ? addressEl.textContent.trim() : null;
                    
                    // License ID
                    const licenseEl = card.querySelector('.license');
                    const licenseId = licenseEl ? licenseEl.textContent.trim() : null;
                    
                    // License Type
                    const typeEl = card.querySelector('.license_type');
                    const licenseType = typeEl ? typeEl.textContent.trim() : null;
                    
                    // Phone - from tel: link (FIXED: decode URL-encoded characters like %20)
                    let phone = null;
                    const phoneLink = card.querySelector('a[href^="tel:"]');
                    if (phoneLink) {
                        // Decode URL encoding (%20 -> space) then extract digits
                        const rawPhone = decodeURIComponent(phoneLink.href.replace('tel:', '')).trim();
                        const digits = rawPhone.replace(/\\D/g, '');
                        if (digits.length === 8) {
                            phone = '+65' + digits;
                        } else {
                            phone = '+65' + digits;
                        }
                    }
                    
                    // Email - from mailto: link
                    let email = null;
                    const emailLink = card.querySelector('a[href^="mailto:"]');
                    if (emailLink) {
                        email = emailLink.href.replace('mailto:', '').trim().toLowerCase();
                    }
                    
                    // Website - links starting with // or http (excluding stb.gov.sg)
                    let website = null;
                    const allLinks = card.querySelectorAll('a[href^="//"], a[href^="http"]');
                    for (const link of allLinks) {
                        const href = link.getAttribute('href');
                        if (href && !href.includes('stb.gov.sg')) {
                            website = href.startsWith('//') ? 'https:' + href : href;
                            break;
                        }
                    }
                    
                    // Only add if we have a company name
                    if (companyName) {
                        results.push({
                            company_name: companyName,
                            address: address,
                            email: email,
                            phone: phone,
                            website: website,
                            license_id: licenseId,
                            license_type: licenseType
                        });
                    }
                } catch (e) {
                    // Skip card on error
                }
            });
            
            return results;
        });
        
        return { 
            status: 'success', 
            pageIndex, 
            agents, 
            count: agents.length
        };
        
    } catch (err) {
        return { status: 'error', pageIndex, error: err.message, agents: [] };
    }
}
"""

def generate_page_urls(start_page: int, end_page: int) -> list:
    """Generate URLs for a range of pages"""
    base_url = "https://trust.stb.gov.sg/site/content/tagaem/landing-page/travel-agent.html"
    return [
        {"url": f"{base_url}?service=ALL&type=ALL&status=TA_A&curIndex={i}", 
         "userData": {"pageIndex": i}}
        for i in range(start_page, end_page)
    ]

def run_batch(client, start_urls: list, batch_num: int) -> tuple:
    """Run a batch of pages through Apify with concurrency - COST OPTIMIZED"""
    
    run_input = {
        "startUrls": start_urls,
        "useChrome": False,
        "headless": True,
        "stealth": False,
        "pageFunction": create_stb_pagefunction_optimized(),
        "maxRequestRetries": MAX_RETRIES,
        "maxRequestsPerCrawl": len(start_urls),
        "maxConcurrency": MAX_CONCURRENCY,
        "memoryMbytes": 2048,
        "pageLoadTimeoutSecs": 30,
        "pageFunctionTimeoutSecs": 60,
        "waitUntil": ["domcontentloaded"],
        "proxyConfiguration": {"useApifyProxy": True},
    }
    
    print(f"\n[Batch {batch_num}] Starting {len(start_urls)} pages with concurrency={MAX_CONCURRENCY}")
    
    try:
        run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
        run_client = client.run(run["id"])
        run_info = run_client.wait_for_finish()
        
        print(f"[Batch {batch_num}] Actor status: {run_info.get('status')}")
        
        if run_info.get('status') == "SUCCEEDED":
            dataset = client.dataset(run["defaultDatasetId"])
            items = list(dataset.iterate_items())
            return items, None
        return [], f"Failed: {run_info.get('status')}"
    except Exception as e:
        return [], str(e)

def run_full_scrape(client, start_page: int, total_pages: int, batch_size: int):
    """Run full scrape with batching"""
    
    all_agents = []
    failed_pages = []
    batch_num = 0
    
    end_page = start_page + total_pages
    
    for batch_start in range(start_page, end_page, batch_size):
        batch_num += 1
        batch_end = min(batch_start + batch_size, end_page)
        
        print(f"\n{'='*50}")
        print(f"BATCH {batch_num}: Pages {batch_start} to {batch_end - 1}")
        print(f"{'='*50}")
        
        start_urls = generate_page_urls(batch_start, batch_end)
        items, error = run_batch(client, start_urls, batch_num)
        
        if error:
            print(f"[Batch {batch_num}] ERROR: {error}")
            failed_pages.extend(range(batch_start, batch_end))
        else:
            batch_agents = 0
            for item in items:
                if item.get('status') == 'success':
                    agents = item.get('agents', [])
                    all_agents.extend(agents)
                    batch_agents += len(agents)
                else:
                    page_idx = item.get('pageIndex', 'unknown')
                    print(f"  Page {page_idx} failed: {item.get('error', 'unknown')}")
                    if page_idx != 'unknown':
                        failed_pages.append(page_idx)
            
            print(f"[Batch {batch_num}] Extracted {batch_agents} agents")
        
        print(f"Running total: {len(all_agents)} agents")
    
    return all_agents, failed_pages


# ============================================================
# RUN FULL SCRAPE
# ============================================================

print("=" * 60)
print("STB TRAVEL AGENT SCRAPER - COST OPTIMIZED")
print("=" * 60)
print(f"Configuration:")
print(f"  - Start page: {START_PAGE}")
print(f"  - Total pages: {TOTAL_PAGES}")
print(f"  - Batch size: {BATCH_SIZE}")
print(f"  - Concurrency: {MAX_CONCURRENCY}")
print(f"  - Total batches: {(TOTAL_PAGES + BATCH_SIZE - 1) // BATCH_SIZE}")
print(f"  - Memory: 2048 MB (optimized)")
print(f"  - Proxy: Default Apify proxy")

start_time = datetime.now()

all_agents, failed_pages = run_full_scrape(client, START_PAGE, TOTAL_PAGES, BATCH_SIZE)

end_time = datetime.now()
duration = end_time - start_time

# ============================================================
# RESULTS - SAVE TO DATAFRAME
# ============================================================
print("\n" + "=" * 60)
print("SCRAPE COMPLETE")
print("=" * 60)
print(f"Total agents extracted: {len(all_agents)}")
print(f"Failed pages: {len(failed_pages)}")
print(f"Duration: {duration}")

if failed_pages:
    print(f"Failed page indices: {failed_pages}")

if all_agents:
    # Create DataFrame and remove duplicates
    stb_travel_agents_df = pd.DataFrame(all_agents)
    stb_travel_agents_df = stb_travel_agents_df.drop_duplicates(
        subset=['company_name', 'license_id'], 
        keep='first'
    ).reset_index(drop=True)
    
    print(f"Unique agents (after dedup): {len(stb_travel_agents_df)}")
else:
    stb_travel_agents_df = pd.DataFrame()
    print("No data extracted")


STB TRAVEL AGENT SCRAPER - COST OPTIMIZED
Configuration:
  - Start page: 0
  - Total pages: 100
  - Batch size: 50
  - Concurrency: 3
  - Total batches: 2
  - Memory: 2048 MB (optimized)
  - Proxy: Default Apify proxy

BATCH 1: Pages 0 to 49

[Batch 1] Starting 50 pages with concurrency=3


[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> Status: RUNNING, Message: Starting the crawler.
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> 2026-01-26T03:31:46.538Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> 2026-01-26T03:31:46.540Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> 2026-01-26T03:31:46.579Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> 2026-01-26T03:31:46.791Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:uppqKzMLHTbSbs1eS][0m -> 2026-01-26T03:31:47.551Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","cr

[Batch 1] Actor status: SUCCEEDED
[Batch 1] Extracted 750 agents
Running total: 750 agents

BATCH 2: Pages 50 to 99

[Batch 2] Starting 50 pages with concurrency=3


[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> 2026-01-26T03:34:13.025Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> 2026-01-26T03:34:13.027Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> 2026-01-26T03:34:13.068Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> 2026-01-26T03:34:13.399Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:vkddUEUXQEfFUxnvv][0m -> 2026-01-26T03:34:14.372Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:vkd

[Batch 2] Actor status: SUCCEEDED
  Page 50 failed: Waiting for selector `.name` failed
  Page 84 failed: Waiting for selector `.name` failed
  Page 86 failed: Waiting for selector `.name` failed
  Page 87 failed: Waiting for selector `.name` failed
  Page 88 failed: Waiting for selector `.name` failed
  Page 89 failed: Waiting for selector `.name` failed
  Page 90 failed: Waiting for selector `.name` failed
  Page 91 failed: Waiting for selector `.name` failed
  Page 92 failed: Waiting for selector `.name` failed
  Page 93 failed: Waiting for selector `.name` failed
  Page 94 failed: Waiting for selector `.name` failed
  Page 97 failed: Waiting for selector `.name` failed
  Page 96 failed: Waiting for selector `.name` failed
  Page 98 failed: Waiting for selector `.name` failed
  Page 99 failed: Waiting for selector `.name` failed
  Page 85 failed: Waiting for selector `.name` failed
  Page 95 failed: Waiting for selector `.name` failed
  Page 96 failed: Waiting for selector `.name` f

In [3]:
stb_travel_agents_df

Unnamed: 0,company_name,address,email,phone,website,license_id,license_type
0,GLOBE TRAVEL PTE. LTD.,111 NORTH BRIDGE ROAD #07-09 PENINSULA PLAZA S...,globetrvlsg@gmail.com,+6588047861,,TA03459,General
1,GLOBETROTTERS' PREMIUM TRAVEL CLUB PTE. LTD.,2 SEMBAWANG WALK #01-49 SPRINGHILL SINGAPORE 7...,globetrotter.premium.travel.club@gmail.com,+6580570586,,TA03864,General
2,GLOBYA PTE. LTD.,703 HOUGANG AVENUE 2 #12-199 SINGAPORE 530703,globya.sg@gmail.com,+6591992302,https://www.globya.info%20,TA03519,General
3,GLOREE TOURS AND TRAVELS PTE. LTD.,50 CHIN SWEE ROAD #06-04 THONG CHAI BUILDING S...,info@gloree.com,+6568444666,https://www.gloree.com,TA01867,General
4,GLORY TRAVEL AND TOURS PTE. LTD.,101 UPPER CROSS STREET #04-48 PEOPLE'S PARK CE...,mrahim@glorytravelsg.com,+6591070573,,TA03786,General
...,...,...,...,...,...,...,...
1232,SUPREME TRAVEL & TOURS PTE. LTD.,60 EU TONG SEN STREET #03-05 FURAMA CITY CENTR...,sales@supremetravel.com.sg,+6597888044,,TA01664,General
1233,SUST TRAVEL PTE. LTD.,25 DAIRY FARM ROAD #03-01 SINGAPORE 679047,daniel@susttravel.com,+6587818263,,TA03675,General
1234,SWISH TRAVEL PTE. LTD.,101 UPPER CROSS STREET #06-09 PEOPLE'S PARK CE...,admin@swish.sg,+6565385557,,TA03218,General
1235,SYUKRAN TRAVEL PTE. LTD.,101 JOO CHIAT ROAD #03-01/02 GV BUILDING SINGA...,admin@syukrantravel.com,+6569048436,,TA03183,General


In [5]:
stb_travel_agents_df.to_csv('stb_travel_agents.csv', index=False)

In [6]:
stb_travel_agents_df.shape 

(1237, 7)

In [4]:
# # CHECK TOTAL PAGES - IMPROVED VERSION
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import re
# import time

# def check_stb_total_pages_v2():
#     """Check total pages - improved version with more detection methods"""
    
#     url = "https://trust.stb.gov.sg/site/content/tagaem/landing-page/travel-agent.html?service=ALL&type=ALL&status=TA_A&curIndex=0"
    
#     print("Launching browser...")
    
#     options = Options()
#     options.add_argument('--headless')
#     options.add_argument('--disable-gpu')
#     options.add_argument('--no-sandbox')
    
#     driver = webdriver.Chrome(options=options)
    
#     try:
#         driver.get(url)
        
#         # Wait for page to load
#         WebDriverWait(driver, 20).until(
#             EC.presence_of_element_located((By.CSS_SELECTOR, '.box-list.grid'))
#         )
        
#         # Scroll to bottom to trigger lazy-loaded pagination
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(2)
        
#         # Count cards
#         cards = driver.find_elements(By.CSS_SELECTOR, '.box-list.grid')
#         cards_count = len(cards)
        
#         # Get full page HTML and text
#         page_html = driver.page_source
#         page_text = driver.find_element(By.TAG_NAME, 'body').text
        
#         print(f"\n{'='*50}")
#         print("PAGINATION INFO:")
#         print(f"{'='*50}")
#         print(f"Cards on first page: {cards_count}")
        
#         # Method 1: Look for any number patterns in text
#         # Common patterns: "1500 results", "Page 1 of 100", "Showing 1-15 of 1500"
#         patterns = [
#             r'(\d[\d,]*)\s*(?:results?|records?|entries|items|agents?)',
#             r'page\s*\d+\s*of\s*(\d+)',
#             r'showing\s*\d+\s*-\s*\d+\s*of\s*([\d,]+)',
#             r'total[:\s]*(\d[\d,]*)',
#         ]
        
#         for pattern in patterns:
#             match = re.search(pattern, page_text, re.IGNORECASE)
#             if match:
#                 total = int(match.group(1).replace(',', ''))
#                 print(f"Found total: {total} (pattern: {pattern[:30]}...)")
#                 if cards_count > 0:
#                     estimated_pages = (total + cards_count - 1) // cards_count
#                     print(f"Estimated pages: {estimated_pages}")
#                 break
        
#         # Method 2: Look for pagination elements
#         pagination_selectors = [
#             '.pagination a',
#             '.pager a', 
#             'a[href*="curIndex"]',
#             'a[href*="page="]',
#             'button[data-page]',
#             '.page-numbers',
#             'nav[aria-label*="pagination"] a',
#             'ul.pagination li a'
#         ]
        
#         max_page = 0
#         for selector in pagination_selectors:
#             try:
#                 elements = driver.find_elements(By.CSS_SELECTOR, selector)
#                 for el in elements:
#                     text = el.text.strip()
#                     href = el.get_attribute('href') or ''
                    
#                     # Check text for numbers
#                     if text.isdigit():
#                         max_page = max(max_page, int(text))
                    
#                     # Check href for page indices
#                     for param in ['curIndex', 'page', 'p']:
#                         match = re.search(rf'{param}[=:](\d+)', href)
#                         if match:
#                             max_page = max(max_page, int(match.group(1)))
#             except:
#                 pass
        
#         if max_page > 0:
#             print(f"Max page from pagination: {max_page}")
        
#         # Method 3: Check for "next" or "last" buttons
#         last_selectors = ['a[title*="last"]', 'a[aria-label*="last"]', '.last a', 'a:contains("»")']
#         for selector in last_selectors:
#             try:
#                 el = driver.find_element(By.CSS_SELECTOR, selector)
#                 href = el.get_attribute('href') or ''
#                 match = re.search(r'curIndex=(\d+)', href)
#                 if match:
#                     print(f"Last page index: {match.group(1)}")
#             except:
#                 pass
        
#         # Method 4: Print sample of page text for manual inspection
#         print(f"\n--- Sample page text (first 500 chars) ---")
#         print(page_text[:500])
        
#         return {"cards_per_page": cards_count}
        
#     finally:
#         driver.quit()
#         print("\n\nBrowser closed.")

# # Run
# pagination_info = check_stb_total_pages_v2()
