# Data Mining in RecordOwl (Silver 1)

In [None]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


### Ingesting from previous layer

In [None]:
# Bronze-layer artefact written by the previous notebook stage.
parquet_path = "./Staging/Bronze/bronze_data_1.parquet"

# Guard clause: stop immediately when the upstream file is missing.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

# Load the filtered ACRA records and report how much data was read.
acra_data_filtered_by_industry = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(acra_data_filtered_by_industry)} rows from {parquet_path}")
print(acra_data_filtered_by_industry.shape)

### Mining RecordOwl

In [None]:

# SECURITY: the Apify API token is read from the environment instead of being
# embedded in the notebook, so the credential never ends up in version control
# or in shared exports of this file.
# NOTE(review): the token that was previously hard-coded here should be
# considered leaked and rotated in the Apify console.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
if not APIFY_API_TOKEN:
    raise RuntimeError(
        "APIFY_API_TOKEN is not set; export your Apify API token before running this notebook."
    )

# Shared Apify client used by the scraping helpers and the main loop below.
client = ApifyClient(APIFY_API_TOKEN)

# Host names of social-media platforms.
SOCIAL_MEDIA_DOMAINS = [
    "facebook.com", "linkedin.com", "instagram.com", "youtube.com",
    "tiktok.com", "twitter.com", "x.com", "pinterest.com"
]

def fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=3):
    """Fetch all items of a dataset, falling back and retrying on errors.

    Every attempt runs two strategies in order:

    1. Stream the dataset with ``dataset_client.iterate_items()``.
    2. If streaming raises or yields nothing, page through
       ``dataset_client.list_items(offset=..., limit=..., clean=True)``.

    A paged fetch that completes without raising is treated as authoritative,
    so its result (possibly empty) is returned immediately. Only when the
    paged fetch itself raises does the function back off exponentially and
    start a new attempt.

    Args:
        dataset_client: Object exposing ``iterate_items()`` and
            ``list_items(offset, limit, clean)``; the latter must return an
            object with an ``items`` list.
        max_retries: Maximum number of attempts.
        initial_wait: Seconds to sleep before the second attempt; the wait
            doubles on every further attempt.

    Returns:
        list: The dataset items, or an empty list when nothing could be
        fetched.
    """
    page_size = 100
    last_error = None

    for attempt in range(max_retries):
        # Strategy 1: streaming iteration.
        # ``Exception`` is caught deliberately: this is a best-effort boundary
        # and the client may raise transport errors of several unrelated
        # types. (The previous handler referenced exception names that were
        # never imported, which turned every failure into a NameError.)
        try:
            streamed = list(dataset_client.iterate_items())
        except Exception as exc:
            last_error = exc
            print(f"  ‚ö†Ô∏è Iteration method failed (attempt {attempt + 1}/{max_retries}), trying direct fetch...")
        else:
            if streamed:
                return streamed

        # Strategy 2: explicit pagination. The accumulator is local to this
        # attempt so a failure halfway through never leaks partial pages
        # (and therefore duplicates) into a later attempt.
        try:
            paged = []
            offset = 0
            while True:
                page = dataset_client.list_items(offset=offset, limit=page_size, clean=True)
                if not page.items:
                    break
                paged.extend(page.items)
                if len(page.items) < page_size:
                    # A short page is the last page.
                    break
                offset += page_size
        except Exception as exc:
            last_error = exc
            if attempt < max_retries - 1:
                wait_time = initial_wait * (2 ** attempt)  # Exponential backoff
                print(f"  ‚ö†Ô∏è Direct fetch failed (attempt {attempt + 1}/{max_retries}), retrying in {wait_time}s...")
                time.sleep(wait_time)
        else:
            return paged

    if last_error is not None:
        print(f"  ‚ùå All fetch methods failed: {last_error}")
    return []

def run_apify_with_retry(client, run_input, uen, max_retries=3):
    """Start the puppeteer-scraper actor, retrying with backoff when blocked.

    A run only counts as a real success when its status is "SUCCEEDED" and
    its default dataset holds at least one item. An empty dataset, or a
    "403"/"blocked" marker in the run info or in a raised exception, triggers
    a retry after 30s * 2**attempt while attempts remain.

    Args:
        client: Apify client exposing ``actor()``, ``run()`` and ``dataset()``.
        run_input: Input dictionary passed to the actor call.
        uen: Identifier of the company being processed (used for logging).
        max_retries: Maximum number of attempts.

    Returns:
        tuple: ``(run, None)`` when a run should be used by the caller, or
        ``(None, message)`` when no usable run could be obtained.
    """
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        retries_left = attempt < final_attempt
        backoff = 30 * (2 ** attempt)  # 30s, 60s, 120s, ...

        try:
            print(f"  üì° Starting Apify run for {uen} (attempt {attempt + 1}/{max_retries})...")
            run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

            print("  ‚è≥ Waiting for run to complete...")
            run_info = client.run(run["id"]).wait_for_finish()

            # Without a status there is nothing to inspect: hand the run back.
            if not run_info or "status" not in run_info:
                return run, None

            status = run_info.get("status")

            if status != "SUCCEEDED":
                # Look for signs of blocking anywhere in the run information.
                summary = str(run_info)
                looks_blocked = "403" in summary or "blocked" in summary.lower()
                if looks_blocked and retries_left:
                    print(f"  üö´ Request blocked (403), waiting {backoff}s before retry...")
                    time.sleep(backoff)
                    continue
                return run, None

            if "defaultDatasetId" not in run:
                return run, None

            # Status says "SUCCEEDED": confirm the dataset really has data.
            try:
                probe_client = client.dataset(run["defaultDatasetId"])
                time.sleep(2)  # Brief wait for dataset to be ready
                probe = probe_client.list_items(limit=1, clean=True)

                if probe.items and len(probe.items) > 0:
                    print("  ‚úÖ Run succeeded with data")
                    return run, None

                # An empty dataset after a "successful" run is treated as a block.
                print("  ‚ö†Ô∏è Run completed but dataset is empty (likely 403 block)")
                if not retries_left:
                    return None, "Dataset empty after all retries (403 blocking)"
                print(f"  üîÑ Retrying in {backoff}s...")
                time.sleep(backoff)
                continue
            except Exception as probe_err:
                # The dataset could not be checked; let the caller try the run anyway.
                print(f"  ‚ö†Ô∏è Could not verify dataset: {probe_err}")
                return run, None

        except Exception as call_err:
            detail = str(call_err)
            looks_blocked = "403" in detail or "blocked" in detail.lower()
            if looks_blocked and retries_left:
                print(f"  üö´ Request blocked (403), waiting {backoff}s before retry...")
                time.sleep(backoff)
                continue
            return None, f"Apify call failed: {detail}"

    return None, "Max retries exceeded due to 403 blocking"

all_results = []

for idx, (i, row) in enumerate(acra_data_filtered_by_industry.iterrows(), 1):
    uen = str(row["UEN"]).strip()
    print(f"\nüîé Processing {uen} ({idx}/{len(acra_data_filtered_by_industry)})")

    # Build pageFunction with proper escaping and improved error handling
    page_function = f"""
    async function pageFunction(context) {{
        const {{ page, log, request }} = context;
        const uen = "{uen}";
        log.info("Visiting RecordOwl for UEN: " + uen);

        try {{
            // Step 1: Wait for search input
            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ timeout: 30000 }});
            log.info("Search input found");
            
            // Step 2: Type UEN into search box with error handling and navigation protection
            try {{
                // Wait for page to be stable (no navigation happening)
                log.info("Waiting for page to stabilize...");
                await new Promise(r => setTimeout(r, 2000)); // Wait for any auto-navigation to complete
                
                // Wait for input to be present and stable
                await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ 
                    timeout: 30000,
                    visible: true 
                }});
                
                // Re-find input right before typing (in case page navigated)
                let input = await page.$("input[placeholder='Search company name, industry, or address']");
                if (!input) {{
                    log.error("Input element not found after wait");
                    return {{ status: 'error', uen, error: 'Input element not found' }};
                }}
                
                // Clear and type with retry logic
                let typed = false;
                for (let attempt = 0; attempt < 3; attempt++) {{
                    try {{
                        // Re-find input on each attempt (in case context was destroyed)
                        input = await page.$("input[placeholder='Search company name, industry, or address']");
                        if (!input) {{
                            throw new Error("Input not found on attempt " + (attempt + 1));
                        }}
                        
                        // Click to focus
                        await input.click({{ clickCount: 3 }});
                        await new Promise(r => setTimeout(r, 300)); // Small delay after click
                        
                        // Clear input first
                        await page.evaluate((selector) => {{
                            const el = document.querySelector(selector);
                            if (el) el.value = '';
                        }}, "input[placeholder='Search company name, industry, or address']");
                        
                        // Type UEN
                        await input.type(uen, {{ delay: 100 }});
                        typed = true;
                        log.info("UEN typed successfully: " + uen);
                        break;
                    }} catch (typeErr) {{
                        if (typeErr.message.includes("Execution context was destroyed") || 
                            typeErr.message.includes("navigation")) {{
                            log.info("Navigation occurred during typing (attempt " + (attempt + 1) + "/3), retrying...");
                            // Wait for page to stabilize after navigation
                            await new Promise(r => setTimeout(r, 2000));
                            // Re-wait for input
                            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ 
                                timeout: 10000,
                                visible: true 
                            }});
                            continue;
                        }} else {{
                            throw typeErr;
                        }}
                    }}
                }}
                
                if (!typed) {{
                    log.error("Failed to type UEN after all retries");
                    return {{ status: 'error', uen, error: 'Failed to type UEN after retries' }};
                }}
                
            }} catch (typeErr) {{
                log.error("Error typing UEN: " + typeErr.message);
                return {{ status: 'error', uen, error: 'Failed to type UEN: ' + typeErr.message }};
            }}

            // Step 3: Submit search with flexible waiting strategy
            try {{
                log.info("Clicking submit button...");
                
                // Click submit button first
                await page.click("button[type='submit']");
                log.info("Submit button clicked");
                
                // Wait for either navigation OR results to appear (more flexible)
                // Strategy: Wait for results to appear, with navigation as optional
                try {{
                    // Option 1: Wait for navigation (if it happens) - non-blocking
                    const navigationPromise = page.waitForNavigation({{ 
                        waitUntil: 'networkidle2', 
                        timeout: 30000 
                    }}).catch(() => {{
                        log.info("Navigation did not occur (may be client-side routing)");
                        return null;
                    }});
                    
                    // Option 2: Wait for results to appear (more reliable)
                    const resultsPromise = page.waitForSelector("a[href*='/company/']", {{ 
                        timeout: 60000 
                    }});
                    
                    // Wait for either navigation or results (whichever happens first)
                    await Promise.race([
                        navigationPromise,
                        resultsPromise
                    ]);
                    
                    // Give page time to stabilize
                    await new Promise(r => setTimeout(r, 2000));
                    log.info("Page stabilized after submit");
                    
                }} catch (waitErr) {{
                    // If both navigation and results wait failed, try one more time for results
                    log.info("Initial wait failed, trying again for results: " + waitErr.message);
                    try {{
                        await page.waitForSelector("a[href*='/company/']", {{ timeout: 30000 }});
                        log.info("Results found on retry");
                    }} catch (retryErr) {{
                        log.info("No company links found after submit, might be not found");
                        return {{ status: 'not_found', uen }};
                    }}
                }}
                
            }} catch (navErr) {{
                log.error("Error during submit: " + navErr.message);
                // Don't fail immediately - try to check if results are already there
                try {{
                    const hasResults = await page.$("a[href*='/company/']");
                    if (hasResults) {{
                        log.info("Results found despite submit error");
                    }} else {{
                        return {{ status: 'error', uen, error: 'Submit failed: ' + navErr.message }};
                    }}
                }} catch (checkErr) {{
                    return {{ status: 'error', uen, error: 'Submit failed: ' + navErr.message }};
                }}
            }}

            // Step 4: Verify search results are present
            log.info("Verifying company links are present...");
            try {{
                // Double-check that results are actually there
                await page.waitForSelector("a[href*='/company/']", {{ timeout: 10000 }});
                log.info("Company links confirmed");
            }} catch (e) {{
                log.info("No company links found, might be not found");
                return {{ status: 'not_found', uen }};
            }}

            // Step 5: Find the correct company link (in a new execution context after navigation)
            let companyLink;
            try {{
                companyLink = await page.evaluate((searchUen) => {{
                    const links = Array.from(document.querySelectorAll("a[href*='/company/']"));
                    
                    // Find link where UEN appears in text or URL
                    const uenUpper = searchUen.toUpperCase();
                    const uenLower = searchUen.toLowerCase();
                    
                    for (const a of links) {{
                        const text = (a.innerText || "").toUpperCase();
                        const href = (a.href || "").toLowerCase();
                        
                        // Check if UEN appears in text or URL (case-insensitive)
                        if (text.includes(uenUpper) || href.includes(uenLower)) {{
                            console.log("Found UEN match: " + a.href);
                            return a.href;
                        }}
                    }}
                    
                    // Fallback: Take first company link if available
                    if (links.length > 0) {{
                        console.log("No exact UEN match, using first link: " + links[0].href);
                        return links[0].href;
                    }}
                    
                    console.log("No company links found");
                    return null;
                }}, uen);
                
                if (!companyLink) {{
                    log.info("No company links found on results page");
                    return {{ status: 'not_found', uen }};
                }}
                log.info("Found company link: " + companyLink);
            }} catch (evalErr) {{
                log.error("Error finding company link: " + evalErr.message);
                return {{ status: 'error', uen, error: 'Failed to find company link: ' + evalErr.message }};
            }}

            // Step 6: Navigate to company page if not already there
            if (page.url() !== companyLink) {{
                try {{
                    log.info("Navigating to company page...");
                    await page.goto(companyLink, {{ 
                        waitUntil: 'networkidle2', 
                        timeout: 60000 
                    }});
                    log.info("Company page loaded");
                    
                    // Critical: Wait for page to fully stabilize
                    await new Promise(r => setTimeout(r, 5000));
                }} catch (gotoErr) {{
                    log.error("Error navigating to company page: " + gotoErr.message);
                    return {{ status: 'error', uen, error: 'Failed to load company page: ' + gotoErr.message }};
                }}
            }}

            // Step 7: Wait for content to load (with multiple fallback strategies)
            log.info("Waiting for page content...");
            try {{
                await Promise.race([
                    page.waitForSelector('dt', {{ timeout: 15000 }}),
                    page.waitForSelector('dl', {{ timeout: 15000 }}),
                    page.waitForSelector('.max-w-7xl', {{ timeout: 15000 }}),
                    new Promise(r => setTimeout(r, 10000)) // Fallback: just wait 10s
                ]);
                log.info("Content loaded");
            }} catch (contentErr) {{
                log.info("Content wait timeout, but continuing: " + contentErr.message);
            }}
            
            // Additional stabilization wait
            await new Promise(r => setTimeout(r, 3000));
            
            // Step 7.5: VERIFY we're on the correct company page
            log.info("Verifying UEN on company page...");
            try {{
                const pageUEN = await page.evaluate((searchUen) => {{
                    const pageText = (document.body.innerText || "").toUpperCase();
                    return pageText.includes(searchUen.toUpperCase());
                }}, uen);
                
                if (pageUEN) {{
                    log.info("‚úì UEN verified on page: " + uen);
                }} else {{
                    log.info("‚ö† Warning: UEN not found in page text, but continuing...");
                }}
            }} catch (verifyErr) {{
                log.info("Could not verify UEN, but continuing: " + verifyErr.message);
            }}
            
            // Step 8: Extract content (in stable context) - ONLY VISIBLE ELEMENTS
            let html_content, title, url;
            try {{
                // Get only the visible HTML content by removing hidden elements
                await page.evaluate(() => {{
                    // Remove all elements that are hidden from view
                    const allElements = document.querySelectorAll('*');
                    allElements.forEach(el => {{
                        const style = window.getComputedStyle(el);
                        // Mark hidden elements with a special attribute
                        if (style.display === 'none' || 
                            style.visibility === 'hidden' || 
                            style.opacity === '0' ||
                            el.hidden ||
                            el.hasAttribute('hidden')) {{
                            el.setAttribute('data-hidden-element', 'true');
                        }}
                    }});
                }});
                
                html_content = await page.content();
                title = await page.title();
                url = page.url();
                log.info("Successfully extracted HTML content (" + html_content.length + " chars)");
            }} catch (extractErr) {{
                log.error("Error extracting content: " + extractErr.message);
                return {{ status: 'error', uen, error: 'Failed to extract content: ' + extractErr.message }};
            }}

            return {{ status: 'success', uen, url, title, html_content }};
            
        }} catch (err) {{
            log.error("Unexpected error in pageFunction: " + err.message);
            log.error("Stack: " + err.stack);
            return {{ status: 'error', uen, error: err.message }};
        }}
    }}
    """

    run_input = {
        "startUrls": [{"url": "https://recordowl.com/"}],
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "pageFunction": page_function,
        "ignoreSslErrors": False,
        "ignoreCorsAndCsp": False,
        "maxRequestRetries": 3,  # Increased retry attempts
        "maxRequestsPerCrawl": 1,  # One page per run
        "maxConcurrency": 1,  # No parallel requests
        "pageLoadTimeoutSecs": 90,  # Optimized timeout
        "pageFunctionTimeoutSecs": 180,  # 3 minutes for pageFunction
        "waitUntil": ["networkidle2"],  # Wait for network to be idle
        # OPTIMIZED: Residential proxies with recommended rotation
        "proxyConfiguration": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],  # Residential IPs less likely to be blocked
        },
        "proxyRotation": "RECOMMENDED",  # Optimal proxy rotation strategy
    }

    # Use retry logic for 403 errors (5 attempts = more chances to recover)
    run, error = run_apify_with_retry(client, run_input, uen, max_retries=5)

    if error or not run:
        print(f"  ‚ùå Apify call failed for {uen}: {error}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": error or "No run returned"
        })
        time.sleep(10)  # Longer sleep after failure
        continue

    if not run or "defaultDatasetId" not in run:
        print(f"  ‚ö†Ô∏è No valid dataset returned for {uen}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": "No dataset returned"
        })
        continue

    # Wait for dataset to be ready with progressive checking
    print(f"  ‚è≥ Waiting for dataset to be ready...")
    time.sleep(5)  # Initial wait
    
    # Try to fetch dataset with progressive waits
    dataset_client = client.dataset(run["defaultDatasetId"])
    for check_attempt in range(3):
        try:
            # Quick check if dataset has items
            test_fetch = dataset_client.list_items(limit=1, clean=True)
            if test_fetch.items:
                break
        except:
            pass
        
        if check_attempt < 2:
            additional_wait = 3 * (check_attempt + 1)
            print(f"  ‚è≥ Dataset not ready, waiting {additional_wait}s more...")
            time.sleep(additional_wait)
    
    scraped_html, record_owl_url = None, None
    
    # Fetch dataset items with improved error handling
    dataset_items = fetch_dataset_items_safe(
        dataset_client,
        max_retries=5,
        initial_wait=5  # Increased from 3 to 5
    )
    
    # Process items
    if not dataset_items:
        print(f"  ‚ö†Ô∏è Dataset is empty - no items returned!")
    else:
        print(f"  üìä Dataset has {len(dataset_items)} item(s)")
    
    for item in dataset_items:
        if item.get("status") == "success":
            scraped_html = item.get("html_content", "")
            record_owl_url = item.get("url")
            if scraped_html:
                print(f"  ‚úÖ Successfully scraped {uen} ({len(scraped_html)} chars of HTML)")
            else:
                print(f"  ‚ö†Ô∏è Status is 'success' but html_content is empty for {uen}")
        elif item.get("status") == "not_found":
            print(f"  ‚ö†Ô∏è Company not found for UEN {uen}")
        elif item.get("status") == "error":
            print(f"  ‚ùå Error for {uen}: {item.get('error')}")
        else:
            print(f"  ‚ö†Ô∏è Unknown item status for {uen}: {item.get('status')}")
            print(f"  üìã Item keys: {list(item.keys())}")

    if not scraped_html:
        # Determine the specific reason for failure
        if not dataset_items:
            error_reason = "Dataset empty (likely 403 block at Apify level)"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "not_found" for item in dataset_items):
            error_reason = "Company not found on RecordOwl"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "error" for item in dataset_items):
            error_details = [item.get("error", "Unknown") for item in dataset_items if item.get("status") == "error"]
            error_reason = f"Scraping error: {error_details[0] if error_details else 'Unknown'}"
            print(f"  ‚ùå {error_reason}")
        else:
            error_reason = "No HTML content retrieved (unknown reason)"
            print(f"  ‚ö†Ô∏è {error_reason}")
            # Debug: show what's in dataset items
            if dataset_items:
                print(f"  üîç DEBUG - First item: {dataset_items[0]}")
        
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": error_reason
        })
        time.sleep(5)
        continue

    # Parse HTML
    try:
        soup = BeautifulSoup(scraped_html, "html.parser")
        
        # ========== CLEAN HTML: REMOVE HIDDEN/UNWANTED ELEMENTS ==========
        # Remove hidden elements
        for elem in soup.find_all(attrs={"data-hidden-element": "true"}):
            elem.decompose()
        
        # Target company overview (exclude officer/director personal data)
        overview_tab = (soup.select_one("#overview") or 
                       soup.select_one("[aria-labelledby*='overview']") or
                       soup.select_one("div[role='tabpanel']"))
        
        if overview_tab:
            parent = overview_tab
        else:
            parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")
            if parent:
                # Remove officer/shareholder sections
                for unwanted in parent.select("#officers, #shareholders, #appointments, "
                                             "[id*='officer'], [id*='shareholder'], [id*='appointment']"):
                    unwanted.decompose()
        
        # Remove non-visible content
        if parent:
            for unwanted in parent.select("script, style, noscript, [style*='display:none']"):
                unwanted.decompose()
        # ========== END CLEAN HTML ==========

        emails, phones, website = [], [], None
        facebook_links, linkedin_links, instagram_links, tiktok_links = [], [], [], []
        
        # Helper function to check if element is visible
        def is_element_visible(element):
            """Check if a BeautifulSoup element appears to be visible (not hidden)."""
            if element is None:
                return False
            # Check for hidden attribute
            if element.has_attr('data-hidden-element'):
                return False
            # Check for common hidden styles
            style = element.get('style', '')
            if any(hidden_style in style.lower() for hidden_style in ['display:none', 'display: none', 'visibility:hidden', 'visibility: hidden']):
                return False
            # Check for hidden/aria-hidden attributes
            if element.get('hidden') or element.get('aria-hidden') == 'true':
                return False
            return True

        if parent:
            # Extract emails
            for a in parent.select("a[href^=mailto]"):
                email = a.get("href", "").replace("mailto:", "").strip()
                if email and email not in emails and "@" in email:
                    emails.append(email)

            # ========== COMPREHENSIVE SINGAPORE PHONE EXTRACTION ==========
            # 
            # ‚úÖ HANDLES ALL POSSIBLE SINGAPORE PHONE NUMBER FORMATS:
            # 
            #    International formats with country code +65:
            #      ‚Ä¢ +65 6694 5996       (standard international)
            #      ‚Ä¢ +65-6694-5996       (with dashes)
            #      ‚Ä¢ +65.6694.5996       (with dots)
            #      ‚Ä¢ +656694 5996        (partial spacing)
            #      ‚Ä¢ +6566945996         (no spacing)
            #      ‚Ä¢ +65 66945996        (no spacing in local part)
            #    
            #    With parentheses:
            #      ‚Ä¢ (+65) 6694 5996     (parentheses with plus)
            #      ‚Ä¢ (65) 6694 5996      (parentheses without plus)
            #      ‚Ä¢ +65(6694)5996       (area code style)
            #      ‚Ä¢ +65 (6694) 5996     (with spaces)
            #    
            #    Local formats without country code:
            #      ‚Ä¢ 6566945996          (country code without plus, no spaces)
            #      ‚Ä¢ 65 6694 5996        (country code with spaces)
            #      ‚Ä¢ 65-6694-5996        (country code with dashes)
            #      ‚Ä¢ 6694 5996           (8 digits with space)
            #      ‚Ä¢ 66945996            (8 digits no space)
            #      ‚Ä¢ 6694-5996           (8 digits with dash)
            #      ‚Ä¢ 6694.5996           (8 digits with dot)
            #      ‚Ä¢ 669 45996           (odd spacing patterns)
            #    
            #    Any combination of separators (spaces, dashes, dots, parentheses, slashes)
            # 
            # ✅ VALIDATION RULES:
            #    • Mobile numbers: Start with 8 or 9 (e.g., 8123 4567, 9123 4567)
            #    • Fixed line: Start with 6 (e.g., 6123 4567, 6694 5996)
            #    • Length: Exactly 8 digits (local) or 10 digits (with country code 65);
            #      longer digit strings are searched for an embedded 65 + 8-digit number
            #    • Country code: Singapore +65 only
            #    • Rejects digit strings of 9+ digits that begin with one of the 20+
            #      blacklisted non-Singapore country codes
            #    • Filters visible elements only (no hidden HTML data)
            #    • Excludes personal contacts (officers, directors, shareholders)
            # 
            # ❌ AUTOMATICALLY REJECTED:
            #    • Non-Singapore country codes: +60 (Malaysia), +62 (Indonesia), +63 (Philippines),
            #      +66 (Thailand), +81 (Japan), +82 (Korea), +84 (Vietnam), +86 (China), +91 (India), etc.
            #    • Numbers with fewer than 8 digits or an otherwise incorrect digit count
            #    • Numbers not starting with 6, 8, or 9 (after the country code)
            #    • Hidden or non-visible HTML elements
            #    • Personal/officer contact information
            #
            # 📤 OUTPUT FORMAT:
            #    All valid numbers are normalized to: +6512345678 (international format)
            
            def validate_sg_phone(digits_str):
                """
                Validate a Singapore phone number and normalize it to "+65XXXXXXXX".

                Non-digit characters (spaces, dashes, dots, parentheses, "+") are
                stripped before validation, so both digit-only strings and raw
                text such as "+65 6694 5996" are accepted.

                Rules, applied in this order:
                  1. Fewer than 8 digits: rejected.
                  2. Exactly 8 digits: accepted only if the first digit is 6, 8
                     or 9; "+65" is prepended. This runs BEFORE the country-code
                     blacklist so a local number like "6337 8789" is not
                     mistaken for a +63 (Philippines) number.
                  3. 9 or more digits starting with a blacklisted foreign
                     country code: rejected.
                  4. Exactly 10 digits: must be "65" followed by 6, 8 or 9.
                  5. More than 10 digits: the first embedded "65" + [6/8/9] +
                     7 digits is returned (covers prefixes such as "00" or
                     trailing extension digits).
                  6. Everything else: rejected. There is deliberately no
                     "last 8 digits" fallback, to avoid false positives.

                Args:
                    digits_str (str): Phone number text, normally digits only
                        (e.g., "6566945996", "66945996").

                Returns:
                    str or None: "+65" followed by the 8-digit local number if
                    valid, None otherwise.

                Examples:
                    validate_sg_phone("6566945996")   -> "+6566945996"    (country code + 8 digits)
                    validate_sg_phone("66945996")     -> "+6566945996"    (8 digits, add country code)
                    validate_sg_phone("63378789")     -> "+6563378789"    (8 digits, NOT rejected as +63)
                    validate_sg_phone("60391312823")  -> None             (Malaysia +60, rejected)
                    validate_sg_phone("63123456789")  -> None             (Philippines +63, rejected)
                    validate_sg_phone("81234567")     -> "+6581234567"    (mobile number)
                """
                if not digits_str:
                    return None

                # Keep digits only so callers may pass either raw or pre-cleaned text.
                digits = re.sub(r"\D", "", digits_str)
                if len(digits) < 8:
                    return None

                # Country codes that are NOT Singapore. A tuple lets
                # str.startswith() test all of them in a single call.
                non_sg_codes = (
                    # ASEAN and neighbouring countries
                    "60",   # Malaysia
                    "62",   # Indonesia
                    "63",   # Philippines
                    "66",   # Thailand
                    "84",   # Vietnam
                    "95",   # Myanmar
                    "855",  # Cambodia
                    "856",  # Laos
                    "880",  # Bangladesh
                    # East Asia
                    "81",   # Japan
                    "82",   # South Korea
                    "86",   # China
                    "852",  # Hong Kong
                    "853",  # Macau
                    "886",  # Taiwan
                    # South Asia
                    "91",   # India
                    "92",   # Pakistan
                    "93",   # Afghanistan
                    "94",   # Sri Lanka
                    # Oceania
                    "61",   # Australia
                    "64",   # New Zealand
                    # Others
                    "90",   # Turkey
                    "98",   # Iran
                )

                # ========== STEP 1: 8-DIGIT LOCAL FORMAT ==========
                # Checked before the blacklist: an 8-digit local number may
                # legitimately start with digits that look like a foreign code
                # (e.g. "6337 8789" vs. Philippines "+63").
                if len(digits) == 8:
                    return "+65" + digits if digits[0] in "689" else None

                # ========== STEP 2: REJECT NON-SINGAPORE COUNTRY CODES ==========
                # Only strings of 9+ digits reach this point.
                if digits.startswith(non_sg_codes):
                    return None

                # ========== STEP 3: 10-DIGIT INTERNATIONAL FORMAT ==========
                # "65" + local number whose first digit is 6, 8 or 9.
                if len(digits) == 10:
                    if digits.startswith("65") and digits[2] in "689":
                        return "+" + digits
                    return None

                # ========== STEP 4: LONGER STRINGS ==========
                # Return the first embedded "65" + [6/8/9] + 7 digits, e.g. a
                # number with a "00" dialling prefix or trailing extension digits.
                if len(digits) > 10:
                    embedded = re.search(r"65[689]\d{7}", digits)
                    if embedded:
                        return "+" + embedded.group(0)

                # ========== STEP 5: REJECT ALL OTHER CASES ==========
                # No guessing and no "last 8 digits" fallback.
                return None
            
            # Method 1: Extract from tel: links (most reliable)
            tel_links = [link for link in parent.select("a[href^='tel:'], a[href^='tel']") 
                        if is_element_visible(link)]
            
            for a in tel_links:
                tel_href = a.get("href", "").replace("tel:", "").strip()
                # Extract digits and validate (validation function handles rejection)
                digits_only = re.sub(r"\D", "", tel_href)
                formatted = validate_sg_phone(digits_only)
                if formatted and formatted not in phones:
                    phones.append(formatted)
            
            # Method 2: Extract from dt/dd structure (company info fields)
            company_keywords = ["company contact", "business contact", "office phone", 
                              "main phone", "business phone", "company phone", "contact number", 
                              "phone", "tel", "mobile", "call", "contact no"]
            exclude_keywords = ["officer", "charge", "employee", "shareholder", "director", 
                              "registration", "person", "individual", "member", "partner",
                              "manager", "owner", "proprietor", "authorized", "representative",
                              "appointment", "designation", "name of", "appointed"]
            
            visible_dt_tags = [dt for dt in parent.select("dt") if is_element_visible(dt)]
            
            for dt in visible_dt_tags:
                dt_text = dt.get_text(strip=True).lower()
                
                # Only extract company-level contacts (not personal)
                is_company = any(kw in dt_text for kw in company_keywords)
                is_personal = any(excl in dt_text for excl in exclude_keywords)
                
                if is_company and not is_personal:
                    dd = dt.find_next_sibling("dd")
                    if dd and is_element_visible(dd):
                        number_text = dd.get_text(" ", strip=True)
                        # Extract digits and validate (validation function handles rejection)
                        all_digits = re.sub(r"\D", "", number_text)
                        formatted = validate_sg_phone(all_digits)
                        if formatted and formatted not in phones:
                            phones.append(formatted)
            
            # Method 3: Fallback text search (only if no phones found via structured data)
            # This method uses comprehensive regex patterns to find phone numbers in free text
            if not phones:
                full_text = parent.get_text()
                
                # COMPREHENSIVE REGEX PATTERNS for all Singapore phone number formats
                # Each pattern handles different separator combinations (spaces, dashes, dots, parentheses)
                sg_patterns = [
                    # Pattern 1: International format with + and any separators
                    # Matches: +65 6694 5996, +65-6694-5996, +65.6694.5996, +656694 5996, +6566945996
                    r"\+[\s\-\.]*65[\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                    
                    # Pattern 2: Parentheses format (+65) or (65)
                    # Matches: (+65) 6694 5996, (65) 6694 5996, (+65)66945996, (65)6694-5996
                    r"\([\s\-\.]*\+?[\s\-\.]*65[\s\-\.]*\)[\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                    
                    # Pattern 3: Country code in middle with parentheses
                    # Matches: +65(6694)5996, +65 (6694) 5996, 65(6694)5996
                    r"\+?[\s\-\.]*65[\s\-\.]*\([\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\)[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                    
                    # Pattern 4: Country code without + (with mandatory separator to avoid false matches)
                    # Matches: 65 6694 5996, 65-6694-5996, 65.6694.5996
                    # Uses negative lookbehind/lookahead to ensure not part of longer number
                    r"(?<!\d)65[\s\-\.]+[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d(?!\d)",
                    
                    # Pattern 5: 8-digit local format (with any separators)
                    # Matches: 6694 5996, 66945996, 6694-5996, 6694.5996, 669 45996
                    # Uses negative lookbehind/lookahead to avoid matching parts of longer numbers
                    r"(?<!\d)[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d(?!\d)",
                    
                    # Pattern 6: Country code without separator (edge case)
                    # Matches: 6566945996 (but only if preceded/followed by non-digit)
                    r"(?<!\d)65[689]\d{7}(?!\d)",
                ]
                
                for pattern in sg_patterns:
                    matches = re.findall(pattern, full_text)
                    for match in matches:
                        # Strip all non-digit characters for validation
                        digits = re.sub(r"\D", "", match)
                        # Validate using our comprehensive validation function
                        formatted = validate_sg_phone(digits)
                        if formatted and formatted not in phones:
                            phones.append(formatted)
            # ========== END PHONE EXTRACTION ==========

            # Extract website: the first absolute link inside `parent` that is
            # not a social-media URL, not a RecordOwl/Apify URL, and contains
            # one of the accepted TLD markers.
            skip_markers = ("recordowl", "apify.com")
            tld_markers = (".com", ".sg", ".net", ".org", ".co")
            valid_websites = []
            for a in parent.select("a[href^=http]"):
                href = a.get("href", "").strip()
                href_lower = href.lower()
                if any(domain in href_lower for domain in SOCIAL_MEDIA_DOMAINS):
                    continue
                if any(skip in href_lower for skip in skip_markers):
                    continue
                # Match TLD markers case-insensitively, like the filters above,
                # so hosts such as "https://WWW.EXAMPLE.SG" are not dropped.
                if any(tld in href_lower for tld in tld_markers):
                    valid_websites.append(href)  # keep the URL's original casing
            website = valid_websites[0] if valid_websites else None

        # Extract social media links from entire page
        for a in soup.find_all("a", href=True):
            href = a["href"].strip().lower()
            if "facebook.com" in href and href not in facebook_links:
                facebook_links.append(href)
            elif "linkedin.com" in href and href not in linkedin_links:
                linkedin_links.append(href)
            elif "instagram.com" in href and href not in instagram_links:
                instagram_links.append(href)
            elif "tiktok.com" in href and href not in tiktok_links:
                tiktok_links.append(href)

        all_results.append({
            "UEN": uen,
            "Emails": emails if emails else None,
            "Phones": phones if phones else None,
            "Website": website,
            "Facebook": list(set(facebook_links)) if facebook_links else None,
            "LinkedIn": list(set(linkedin_links)) if linkedin_links else None,
            "Instagram": list(set(instagram_links)) if instagram_links else None,
            "TikTok": list(set(tiktok_links)) if tiktok_links else None,
            "RecordOwl_Link": record_owl_url,
        })
        
        # Print extraction results with actual phone numbers
        if phones:
            phone_list = ", ".join(phones)
            print(f"  ‚úÖ Extracted: {len(emails) if emails else 0} email(s), {len(phones)} phone(s): {phone_list}")
        else:
            print(f"  ‚úÖ Extracted: {len(emails) if emails else 0} email(s), Phone: None found")
        
    except Exception as e:
        print(f"  ‚ùå Error parsing HTML for {uen}: {e}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": f"HTML parsing error: {str(e)}"
        })

    # Dynamic sleep time to avoid rate limiting and 403 blocks
    # Longer delays reduce detection and blocking
    base_sleep = 20  # Increased from 10
    random_addition = (idx % 10) + 5  # 5-14 seconds random
    sleep_time = base_sleep + random_addition  # 25-34 seconds total

    print(f"  üí§ Sleeping for {sleep_time}s before next request...")
    time.sleep(sleep_time)

    # Extra delay after every 5th request to further avoid detection
    if idx % 5 == 0:
        extra_wait = 30
        print(f"  üõë Checkpoint pause: waiting extra {extra_wait}s...")
        time.sleep(extra_wait)

# Build the results table from the per-company dicts collected in all_results.
# If nothing was collected, pd.DataFrame([]) has no columns at all and the
# summary below would raise KeyError, so make sure the columns it reads exist.
New_Fresh_Leads = pd.DataFrame(all_results)
for _summary_col in ("Emails", "Phones", "Website"):
    if _summary_col not in New_Fresh_Leads.columns:
        New_Fresh_Leads[_summary_col] = None
print("\n‚úÖ Scraping complete!")
print(f"\nüìä Results summary:")
print(f"   Total processed: {len(New_Fresh_Leads)}")
print(f"   With emails: {New_Fresh_Leads['Emails'].notna().sum()}")
print(f"   With phones: {New_Fresh_Leads['Phones'].notna().sum()}")
print(f"   With websites: {New_Fresh_Leads['Website'].notna().sum()}")

New_Fresh_Leads.head(10)