# Data Mining in RecordOwl (Silver 1)

In [1]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
import random
import textwrap
from copy import deepcopy
from apify_client import ApifyClient
from urllib.error import HTTPError
from requests.exceptions import ConnectionError, RequestException


### Ingesting from previous layer

In [2]:
# Location of the Bronze-layer output that this Silver notebook consumes.
parquet_path = "./Staging/Bronze/bronze_data_1.parquet"

# Fail fast with an explicit message when the upstream layer has not been produced yet.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

acra_data_filtered_by_industry = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(acra_data_filtered_by_industry)} rows from {parquet_path}")
print(acra_data_filtered_by_industry.shape)

Loaded 10 rows from ./Staging/Bronze/bronze_data_1.parquet
(10, 14)


In [3]:
# Show the frame loaded from the Bronze parquet file so its columns can be inspected.
acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,53431824W,TUTORSVILLE.SG,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-04-2021,85509,na,COMPASSVALE WALK,540230,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
1,202344030R,CHEM AFFINITY LEARNING CENTRE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,04-11-2023,85509,na,BEACH ROAD,189695,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
2,T15LL1885G,EDUREACH SERVICES LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,11-11-2015,85509,74901,TAMPINES STREET 23,527201,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
3,53200915X,THINK ARTS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,06-10-2011,85509,na,YARROW GARDENS,455021,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4,201733719E,JUS INFANTS @ MACPHERSON PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,22-11-2017,88911,na,KALLANG PUDDING ROAD,349318,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
5,53227394W,MATHS TABLET,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,04-12-2012,85509,na,ANG MO KIO AVENUE 10,560555,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
6,202209857Z,YORK EDUCATION PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,22-03-2022,88911,85101,CASHEW ROAD,679637,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
7,201711911W,MAPLEBEAR LEARNING GARDEN PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,01-05-2017,88911,88912,BRADDELL ROAD,579713,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
8,201540131W,4HANDS DENTAL ASSISTING TRAINING SCHOOL PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,10-11-2015,88991,na,JURONG WEST STREET 64,641684,Others,Hospital,Social Services (Without Accommodations),Job Training And Vocational Rehabilitation Ser...
9,202337418G,OUT OF THE BOX ACADEMY (CLEMENTI) PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,18-09-2023,88912,85509,CLEMENTI AVENUE 3,120433,Others,Hospital,Social Services (Without Accommodations),Student Care Services; Child Minding Services ...


### Mining RecordOwl

In [4]:

# Debug override (disabled): uncomment to restrict the run to a single UEN.
# acra_data_filtered_by_industry = pd.DataFrame({
#     "UEN": ["201711911W"]
# })

In [None]:
# The Apify API token is read from the environment so the secret is never
# stored in the notebook or its version history.
# NOTE(review): a token was previously hard-coded on this line; treat it as
# leaked and rotate it in the Apify console.
_apify_token = os.environ.get("APIFY_TOKEN")
if not _apify_token:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set; export it before running this cell."
    )
client = ApifyClient(_apify_token)

# Social-media host names.
# NOTE(review): presumably used further down to tell social links apart from a
# company website - the usage is outside this view, confirm before editing.
SOCIAL_MEDIA_DOMAINS = [
    "facebook.com", "linkedin.com", "instagram.com", "youtube.com",
    "tiktok.com", "twitter.com", "x.com", "pinterest.com"
]

def fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=3):
    """Fetch all items of a dataset, falling back between two strategies.

    Every attempt tries, in order:

    1. ``dataset_client.iterate_items()`` (streaming), then
    2. ``dataset_client.list_items(offset=..., limit=100, clean=True)``
       paginated until a short or empty page is seen.

    The first strategy that yields a non-empty list wins. When a strategy
    raised during an attempt, the function sleeps ``initial_wait * 2**attempt``
    seconds before the next attempt (exponential backoff).

    Args:
        dataset_client: Object exposing ``iterate_items()`` and
            ``list_items(offset, limit, clean)``; the latter must return an
            object with an ``items`` sequence.
        max_retries: Maximum number of attempts (each runs both strategies).
        initial_wait: Base backoff delay in seconds.

    Returns:
        list: The fetched items, or ``[]`` when nothing could be fetched.
        Exceptions raised by the client are caught and reported via ``print``.
    """
    page_size = 100

    def _stream_items():
        """Strategy 1: materialise the streaming iterator."""
        return list(dataset_client.iterate_items())

    def _paginate_items():
        """Strategy 2: page through the dataset with offset/limit."""
        # Accumulate into a fresh list so a failure midway through pagination
        # cannot leak partial pages into a later attempt (which would
        # duplicate items).
        collected = []
        offset = 0
        while True:
            page = dataset_client.list_items(offset=offset, limit=page_size, clean=True)
            if not page.items:
                break
            collected.extend(page.items)
            if len(page.items) < page_size:
                break
            offset += page_size
        return collected

    strategies = (
        ("Iteration method", _stream_items),
        ("Direct fetch", _paginate_items),
    )
    last_error = None

    for attempt in range(max_retries):
        last_error = None
        for label, fetch in strategies:
            try:
                items = fetch()
            except Exception as e:
                # Catch broadly on purpose: network, HTTP and client errors all
                # mean "try the other strategy, then back off and retry".
                last_error = e
                print(f"  ‚ö†Ô∏è {label} failed (attempt {attempt + 1}/{max_retries}): {e}")
                continue
            if items:
                return items

        # Only back off when something actually failed; an error-free but
        # empty result is re-polled immediately.
        if last_error is not None and attempt < max_retries - 1:
            wait_time = initial_wait * (2 ** attempt)  # Exponential backoff
            print(f"  ‚ö†Ô∏è Retrying in {wait_time}s...")
            time.sleep(wait_time)

    if last_error is not None:
        print(f"  ‚ùå All fetch methods failed: {last_error}")
    return []

def run_apify_with_retry(client, run_input, uen_batch, max_retries=3):
    """Run the Apify puppeteer-scraper actor, retrying when the run looks blocked.

    A run counts as blocked when it finishes with a non-SUCCEEDED status whose
    run info mentions "403"/"blocked", when the call raises an exception with
    such a message, or when it reports SUCCEEDED but its default dataset has
    no items. Blocked runs are retried after ``30 * 2**attempt`` seconds.

    Args:
        client: Apify client exposing ``actor(...).call(...)``, ``run(...)``
            and ``dataset(...)``.
        run_input: Actor input passed to ``call(run_input=...)``.
        uen_batch: Iterable of UENs in this batch; used only for logging.
        max_retries: Maximum number of actor runs to start.

    Returns:
        tuple: ``(run, None)`` when the caller should go on to read the run's
        dataset, or ``(None, error_message)`` when no usable run was produced
        (call failure, or still blocked/empty after the last attempt).
    """
    blocked_error = "Max retries exceeded due to 403 blocking"
    # Loop-invariant; str() keeps non-string UENs from breaking the log line.
    uen_list_str = ", ".join(str(uen) for uen in uen_batch)

    def _looks_blocked(text):
        """Return True when a status/error text hints at a 403 block."""
        return "403" in text or "blocked" in text.lower()

    for attempt in range(max_retries):
        retries_left = attempt < max_retries - 1
        wait_time = 30 * (2 ** attempt)  # 30s, 60s, 120s, ...
        try:
            print(f"  üì° Starting Apify run for batch: {uen_list_str} (attempt {attempt + 1}/{max_retries})...")
            run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

            print(f"  ‚è≥ Waiting for run to complete...")
            run_info = client.run(run["id"]).wait_for_finish()

            if not run_info or "status" not in run_info:
                # No status to inspect: hand the run to the caller as-is.
                return run, None

            if run_info.get("status") != "SUCCEEDED":
                if not _looks_blocked(str(run_info)):
                    # Failed for a reason other than blocking: keep returning
                    # the run so the caller can still inspect its dataset.
                    return run, None
                if not retries_left:
                    # Still blocked on the final attempt: report it as an
                    # error instead of passing a blocked run off as a success.
                    return None, blocked_error
                print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                time.sleep(wait_time)
                continue

            if "defaultDatasetId" not in run:
                return run, None

            # A "SUCCEEDED" status alone is not trusted; confirm the dataset
            # holds at least one item.
            try:
                dataset_check = client.dataset(run["defaultDatasetId"])
                time.sleep(2)  # Brief wait for dataset to be ready
                has_items = bool(dataset_check.list_items(limit=1, clean=True).items)
            except Exception as e:
                print(f"  ‚ö†Ô∏è Could not verify dataset: {e}")
                # If we can't check the dataset, let the caller try the run anyway.
                return run, None

            if has_items:
                print(f"  ‚úÖ Run succeeded with data")
                return run, None

            # SUCCEEDED but empty is treated like a 403 block.
            print(f"  ‚ö†Ô∏è Run completed but dataset is empty (likely 403 block)")
            if not retries_left:
                return None, "Dataset empty after all retries (403 blocking)"
            print(f"  üîÑ Retrying in {wait_time}s...")
            time.sleep(wait_time)

        except Exception as e:
            if _looks_blocked(str(e)) and retries_left:
                print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                time.sleep(wait_time)
                continue
            return None, f"Apify call failed: {e}"

    return None, blocked_error

# Collects the per-UEN result dicts appended by the batch loop below.
all_results = []

# Process UENs in batches of BATCH_SIZE
BATCH_SIZE = 5
total_rows = len(acra_data_filtered_by_industry)
# Ceiling division so a partial final batch is still counted.
total_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE

for batch_idx in range(0, total_rows, BATCH_SIZE):
    batch = acra_data_filtered_by_industry.iloc[batch_idx:batch_idx + BATCH_SIZE]
    batch_num = (batch_idx // BATCH_SIZE) + 1
    
    print(f"\n{'='*60}")
    print(f"üîé Processing Batch {batch_num}/{total_batches} ({len(batch)} UENs)")
    print(f"{'='*60}")
    
    # Build startUrls with userData for each UEN in batch
    start_urls = []
    uen_batch = []
    for _, row in batch.iterrows():
        uen = str(row["UEN"]).strip()
        uen_batch.append(uen)
        start_urls.append({
            "url": "https://recordowl.com/",
            "userData": {"uen": uen}
        })
        print(f"  üìã Added UEN to batch: {uen}")

    # Build pageFunction that reads UEN from request.userData
    page_function = """
    async function pageFunction(context) {
        const { page, log, request } = context;
        const uen = request?.userData?.uen || "";
        
        if (!uen) {
            log.error("Missing UEN in request.userData");
            return { status: 'error', uen: null, error: 'Missing UEN' };
        }
        
        log.info("Visiting RecordOwl for UEN: " + uen);

        try {
            // Step 1: Wait for search input
            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", { timeout: 30000 });
            log.info("Search input found");
            
            // Step 2: Type UEN into search box with error handling and navigation protection
            try {
                // Wait for page to be stable (no navigation happening)
                log.info("Waiting for page to stabilize...");
                await new Promise(r => setTimeout(r, 2000)); // Wait for any auto-navigation to complete
                
                // Wait for input to be present and stable
                await page.waitForSelector("input[placeholder='Search company name, industry, or address']", { 
                    timeout: 30000,
                    visible: true 
                });
                
                // Re-find input right before typing (in case page navigated)
                let input = await page.$("input[placeholder='Search company name, industry, or address']");
                if (!input) {
                    log.error("Input element not found after wait");
                    return { status: 'error', uen, error: 'Input element not found' };
                }
                
                // Clear and type with retry logic
                let typed = false;
                for (let attempt = 0; attempt < 3; attempt++) {
                    try {
                        // Re-find input on each attempt (in case context was destroyed)
                        input = await page.$("input[placeholder='Search company name, industry, or address']");
                        if (!input) {
                            throw new Error("Input not found on attempt " + (attempt + 1));
                        }
                        
                        // Click to focus
                        await input.click({ clickCount: 3 });
                        await new Promise(r => setTimeout(r, 300)); // Small delay after click
                        
                        // Clear input first
                        await page.evaluate((selector) => {
                            const el = document.querySelector(selector);
                            if (el) el.value = '';
                        }, "input[placeholder='Search company name, industry, or address']");
                        
                        // Type UEN
                        await input.type(uen, { delay: 100 });
                        typed = true;
                        log.info("UEN typed successfully: " + uen);
                        break;
                    } catch (typeErr) {
                        if (typeErr.message.includes("Execution context was destroyed") || 
                            typeErr.message.includes("navigation")) {
                            log.info("Navigation occurred during typing (attempt " + (attempt + 1) + "/3), retrying...");
                            // Wait for page to stabilize after navigation
                            await new Promise(r => setTimeout(r, 2000));
                            // Re-wait for input
                            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", { 
                                timeout: 10000,
                                visible: true 
                            });
                            continue;
                        } else {
                            throw typeErr;
                        }
                    }
                }
                
                if (!typed) {
                    log.error("Failed to type UEN after all retries");
                    return { status: 'error', uen, error: 'Failed to type UEN after retries' };
                }
                
            } catch (typeErr) {
                log.error("Error typing UEN: " + typeErr.message);
                return { status: 'error', uen, error: 'Failed to type UEN: ' + typeErr.message };
            }

            // Step 3: Submit search with flexible waiting strategy
            try {
                log.info("Clicking submit button...");
                
                // Click submit button first
                await page.click("button[type='submit']");
                log.info("Submit button clicked");
                
                // Wait for either navigation OR results to appear (more flexible)
                // Strategy: Wait for results to appear, with navigation as optional
                try {
                    // Option 1: Wait for navigation (if it happens) - non-blocking
                    const navigationPromise = page.waitForNavigation({ 
                        waitUntil: 'networkidle2', 
                        timeout: 30000 
                    }).catch(() => {
                        log.info("Navigation did not occur (may be client-side routing)");
                        return null;
                    });
                    
                    // Option 2: Wait for results to appear (more reliable)
                    const resultsPromise = page.waitForSelector("a[href*='/company/']", { 
                        timeout: 60000 
                    });
                    
                    // Wait for either navigation or results (whichever happens first)
                    await Promise.race([
                        navigationPromise,
                        resultsPromise
                    ]);
                    
                    // Give page time to stabilize
                    await new Promise(r => setTimeout(r, 2000));
                    log.info("Page stabilized after submit");
                    
                } catch (waitErr) {
                    // If both navigation and results wait failed, try one more time for results
                    log.info("Initial wait failed, trying again for results: " + waitErr.message);
                    try {
                        await page.waitForSelector("a[href*='/company/']", { timeout: 30000 });
                        log.info("Results found on retry");
                    } catch (retryErr) {
                        log.info("No company links found after submit, might be not found");
                        return { status: 'not_found', uen };
                    }
                }
                
            } catch (navErr) {
                log.error("Error during submit: " + navErr.message);
                // Don't fail immediately - try to check if results are already there
                try {
                    const hasResults = await page.$("a[href*='/company/']");
                    if (hasResults) {
                        log.info("Results found despite submit error");
                    } else {
                        return { status: 'error', uen, error: 'Submit failed: ' + navErr.message };
                    }
                } catch (checkErr) {
                    return { status: 'error', uen, error: 'Submit failed: ' + navErr.message };
                }
            }

            // Step 4: Verify search results are present
            log.info("Verifying company links are present...");
            try {
                // Double-check that results are actually there
                await page.waitForSelector("a[href*='/company/']", { timeout: 10000 });
                log.info("Company links confirmed");
            } catch (e) {
                log.info("No company links found, might be not found");
                return { status: 'not_found', uen };
            }

            // Step 5: Find the correct company link (in a new execution context after navigation)
            let companyLink;
            try {
                companyLink = await page.evaluate((searchUen) => {
                    const links = Array.from(document.querySelectorAll("a[href*='/company/']"));
                    
                    // Find link where UEN appears in text or URL
                    const uenUpper = searchUen.toUpperCase();
                    const uenLower = searchUen.toLowerCase();
                    
                    for (const a of links) {
                        const text = (a.innerText || "").toUpperCase();
                        const href = (a.href || "").toLowerCase();
                        
                        // Check if UEN appears in text or URL (case-insensitive)
                        if (text.includes(uenUpper) || href.includes(uenLower)) {
                            console.log("Found UEN match: " + a.href);
                            return a.href;
                        }
                    }
                    
                    // Fallback: Take first company link if available
                    if (links.length > 0) {
                        console.log("No exact UEN match, using first link: " + links[0].href);
                        return links[0].href;
                    }
                    
                    console.log("No company links found");
                    return null;
                }, uen);
                
                if (!companyLink) {
                    log.info("No company links found on results page");
                    return { status: 'not_found', uen };
                }
                log.info("Found company link: " + companyLink);
            } catch (evalErr) {
                log.error("Error finding company link: " + evalErr.message);
                return { status: 'error', uen, error: 'Failed to find company link: ' + evalErr.message };
            }

            // Step 6: Navigate to company page if not already there
            if (page.url() !== companyLink) {
                try {
                    log.info("Navigating to company page...");
                    await page.goto(companyLink, { 
                        waitUntil: 'networkidle2', 
                        timeout: 60000 
                    });
                    log.info("Company page loaded");
                    
                    // Critical: Wait for page to fully stabilize
                    await new Promise(r => setTimeout(r, 5000));
                } catch (gotoErr) {
                    log.error("Error navigating to company page: " + gotoErr.message);
                    return { status: 'error', uen, error: 'Failed to load company page: ' + gotoErr.message };
                }
            }

            // Step 7: Wait for content to load (with multiple fallback strategies)
            log.info("Waiting for page content...");
            try {
                await Promise.race([
                    page.waitForSelector('dt', { timeout: 15000 }),
                    page.waitForSelector('dl', { timeout: 15000 }),
                    page.waitForSelector('.max-w-7xl', { timeout: 15000 }),
                    new Promise(r => setTimeout(r, 10000)) // Fallback: just wait 10s
                ]);
                log.info("Content loaded");
            } catch (contentErr) {
                log.info("Content wait timeout, but continuing: " + contentErr.message);
            }
            
            // Additional stabilization wait
            await new Promise(r => setTimeout(r, 3000));
            
            // Step 7.5: VERIFY we're on the correct company page
            log.info("Verifying UEN on company page...");
            try {
                const pageUEN = await page.evaluate((searchUen) => {
                    const pageText = (document.body.innerText || "").toUpperCase();
                    return pageText.includes(searchUen.toUpperCase());
                }, uen);
                
                if (pageUEN) {
                    log.info("‚úì UEN verified on page: " + uen);
                } else {
                    log.info("‚ö† Warning: UEN not found in page text, but continuing...");
                }
            } catch (verifyErr) {
                log.info("Could not verify UEN, but continuing: " + verifyErr.message);
            }
            
            // Step 8: Extract content (in stable context) - ONLY VISIBLE ELEMENTS
            let html_content, title, url;
            try {
                // Get only the visible HTML content by removing hidden elements
                await page.evaluate(() => {
                    // Remove all elements that are hidden from view
                    const allElements = document.querySelectorAll('*');
                    allElements.forEach(el => {
                        const style = window.getComputedStyle(el);
                        // Mark hidden elements with a special attribute
                        if (style.display === 'none' || 
                            style.visibility === 'hidden' || 
                            style.opacity === '0' ||
                            el.hidden ||
                            el.hasAttribute('hidden')) {
                            el.setAttribute('data-hidden-element', 'true');
                        }
                    });
                });
                
                html_content = await page.content();
                title = await page.title();
                url = page.url();
                log.info("Successfully extracted HTML content (" + html_content.length + " chars)");
            } catch (extractErr) {
                log.error("Error extracting content: " + extractErr.message);
                return { status: 'error', uen, error: 'Failed to extract content: ' + extractErr.message };
            }

            return { status: 'success', uen, url, title, html_content };
            
        } catch (err) {
            log.error("Unexpected error in pageFunction: " + err.message);
            log.error("Stack: " + err.stack);
            return { status: 'error', uen, error: err.message };
        }
    }
    """

    run_input = {
        "startUrls": start_urls,
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "pageFunction": page_function,
        "ignoreSslErrors": False,
        "ignoreCorsAndCsp": False,
        "maxRequestRetries": 3,
        "maxRequestsPerCrawl": len(start_urls),  # Allow all UENs in batch
        "maxConcurrency": 5,  # Process 2 UENs in parallel
        "pageLoadTimeoutSecs": 90,
        "pageFunctionTimeoutSecs": 180,
        "waitUntil": ["networkidle2"],
        "proxyConfiguration": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],
        },
        "proxyRotation": "RECOMMENDED",
    }

    # Use retry logic for 403 errors (5 attempts = more chances to recover)
    run, error = run_apify_with_retry(client, run_input, uen_batch, max_retries=5)

    if error or not run:
        print(f"  ‚ùå Apify call failed for batch: {error}")
        # Add error results for all UENs in batch
        for uen in uen_batch:
            all_results.append({
                "UEN": uen,
                "Emails": None,
                "Phones": None,
                "Website": None,
                "Facebook": None,
                "LinkedIn": None,
                "Instagram": None,
                "TikTok": None,
                "address": None,
                "RecordOwl_Link": None,
                "Error": error or "No run returned"
            })
        time.sleep(10)  # Longer sleep after failure
        continue

    if not run or "defaultDatasetId" not in run:
        print(f"  ‚ö†Ô∏è No valid dataset returned for batch")
        # Add error results for all UENs in batch
        for uen in uen_batch:
            all_results.append({
                "UEN": uen,
                "Emails": None,
                "Phones": None,
                "Website": None,
                "Facebook": None,
                "LinkedIn": None,
                "Instagram": None,
                "TikTok": None,
                "address": None,
                "RecordOwl_Link": None,
                "Error": "No dataset returned"
            })
        continue

    # Wait for dataset to be ready with progressive checking
    print(f"  ‚è≥ Waiting for dataset to be ready...")
    time.sleep(8)  # Increased wait for concurrent requests (both need to complete)
    
    # Try to fetch dataset with progressive waits
    dataset_client = client.dataset(run["defaultDatasetId"])
    for check_attempt in range(5):
        try:
            # Check if dataset has expected number of items
            test_fetch = dataset_client.list_items(limit=len(uen_batch) + 1, clean=True)
            if test_fetch.items and len(test_fetch.items) >= len(uen_batch):
                break
        except:
            pass
        
        if check_attempt < 4:
            additional_wait = 5 * (check_attempt + 1)
            print(f"  ‚è≥ Dataset not ready, waiting {additional_wait}s more...")
            time.sleep(additional_wait)
    
    # Fetch dataset items with improved error handling
    dataset_items = fetch_dataset_items_safe(
        dataset_client,
        max_retries=5,
        initial_wait=5
    )
    
    # Process items
    if not dataset_items:
        print(f"  ‚ö†Ô∏è Dataset is empty - no items returned!")
    else:
        print(f"  üìä Dataset has {len(dataset_items)} item(s)")
    
    # Create a mapping of UEN to dataset item
    uen_to_item = {}
    for item in dataset_items:
        item_uen = item.get("uen")
        if item_uen:
            uen_to_item[item_uen] = item
    
    # Process each UEN in the batch
    for uen in uen_batch:
        print(f"\n  üîç Processing results for {uen}...")
        
        item = uen_to_item.get(uen)
        scraped_html, record_owl_url = None, None
        
        if item:
            if item.get("status") == "success":
                scraped_html = item.get("html_content", "")
                record_owl_url = item.get("url")
                if scraped_html:
                    print(f"  ‚úÖ Successfully scraped {uen} ({len(scraped_html)} chars of HTML)")
                else:
                    print(f"  ‚ö†Ô∏è Status is 'success' but html_content is empty for {uen}")
            elif item.get("status") == "not_found":
                print(f"  ‚ö†Ô∏è Company not found for UEN {uen}")
            elif item.get("status") == "error":
                print(f"  ‚ùå Error for {uen}: {item.get('error')}")
            else:
                print(f"  ‚ö†Ô∏è Unknown item status for {uen}: {item.get('status')}")
        else:
            print(f"  ‚ö†Ô∏è No dataset item found for UEN {uen}")

        if not scraped_html:
            # Determine the specific reason for failure
            if not item:
                error_reason = "No dataset item returned"
            elif item.get("status") == "not_found":
                error_reason = "Company not found on RecordOwl"
            elif item.get("status") == "error":
                error_reason = f"Scraping error: {item.get('error', 'Unknown')}"
            else:
                error_reason = "No HTML content retrieved (unknown reason)"
            
            print(f"  ‚ùå {error_reason}")
            
            all_results.append({
                "UEN": uen,
                "Emails": None,
                "Phones": None,
                "Website": None,
                "Facebook": None,
                "LinkedIn": None,
                "Instagram": None,
                "TikTok": None,
                "address": None,
                "RecordOwl_Link": record_owl_url or None,
                "Error": error_reason
            })
            continue

        # Parse HTML (keep all existing parsing logic unchanged)
        try:
            soup = BeautifulSoup(scraped_html, "html.parser")
            
            # ========== CLEAN HTML: REMOVE HIDDEN/UNWANTED ELEMENTS ==========
            # Remove hidden elements
            for elem in soup.find_all(attrs={"data-hidden-element": "true"}):
                elem.decompose()
            
            # Target company overview (exclude officer/director personal data)
            overview_tab = (soup.select_one("#overview") or 
                           soup.select_one("[aria-labelledby*='overview']") or
                           soup.select_one("div[role='tabpanel']"))
            
            if overview_tab:
                parent = overview_tab
            else:
                parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")
                if parent:
                    # Remove officer/shareholder sections
                    for unwanted in parent.select("#officers, #shareholders, #appointments, "
                                                 "[id*='officer'], [id*='shareholder'], [id*='appointment']"):
                        unwanted.decompose()
            
            # Remove non-visible content
            if parent:
                for unwanted in parent.select("script, style, noscript, [style*='display:none']"):
                    unwanted.decompose()
            # ========== END CLEAN HTML ==========

            emails, phones, website = [], [], None
            facebook_links, linkedin_links, instagram_links, tiktok_links = [], [], [], []
            
            # Helper function to check if element is visible
            def is_element_visible(element):
                """Return True unless the element carries a marker that hides it.

                Only the element's own attributes are inspected (ancestors are not
                walked). An element counts as hidden when any of these holds:

                * it has a ``data-hidden-element`` attribute (any value),
                * its inline ``style`` contains ``display:none`` or
                  ``visibility:hidden`` (case- and whitespace-insensitive),
                * it has a ``hidden`` attribute, or ``aria-hidden="true"``.

                Args:
                    element: A tag-like object exposing ``has_attr`` and ``get``,
                        or None.

                Returns:
                    bool: False for None or a hidden element, True otherwise.
                """
                if element is None:
                    return False
                # Check for hidden attribute
                if element.has_attr('data-hidden-element'):
                    return False
                # Strip every whitespace character so spacing variants such as
                # "display : none" compare equal to "display:none".
                style = element.get('style') or ''
                compact_style = ''.join(str(style).lower().split())
                if 'display:none' in compact_style or 'visibility:hidden' in compact_style:
                    return False
                # `hidden` is a boolean HTML attribute: a bare `hidden` is parsed as
                # an empty string, so test for presence rather than truthiness.
                if element.has_attr('hidden') or element.get('aria-hidden') == 'true':
                    return False
                return True

            if parent:
                # Extract emails
                for a in parent.select("a[href^=mailto]"):
                    email = a.get("href", "").replace("mailto:", "").strip()
                    if email and email not in emails and "@" in email:
                        emails.append(email)

                # ========== COMPREHENSIVE SINGAPORE PHONE EXTRACTION ==========
                # (Keep all existing phone extraction code exactly as is)
                # ... [All the phone extraction code remains unchanged] ...
                
                def validate_sg_phone(digits_str):
                    """Turn a digits-only string into a ``+65XXXXXXXX`` number, or None.

                    Accepted shapes:
                      * exactly 8 digits starting with 6, 8 or 9 -> "+65" is prepended;
                      * exactly 10 digits of the form 65 + [689] + 7 digits;
                      * more than 10 digits containing such a 10-digit run (the
                        first qualifying run is returned).
                    Any string of 9+ digits that begins with one of the listed
                    foreign country codes is rejected outright.

                    Args:
                        digits_str: String containing digits only (may be empty/None).

                    Returns:
                        str | None: Number formatted as "+65" followed by 8 digits,
                        or None when nothing valid is recognised.
                    """
                    if not digits_str:
                        return None
                    total = len(digits_str)
                    if total < 8:
                        return None

                    foreign_prefixes = (
                        "60", "62", "63", "66", "84", "95", "855", "856", "880",
                        "81", "82", "86", "852", "853", "886",
                        "91", "92", "93", "94",
                        "61", "64",
                        "90", "98",
                    )
                    sg_leading = "689"

                    # Bare local number: no country code present.
                    if total == 8:
                        return "+65" + digits_str if digits_str[0] in sg_leading else None

                    # From 9 digits up, a leading foreign country code disqualifies the string.
                    if digits_str.startswith(foreign_prefixes):
                        return None

                    if total == 10:
                        if digits_str.startswith("65") and digits_str[2] in sg_leading:
                            return "+" + digits_str
                        return None

                    if total > 10:
                        # Slide over the string looking for "65" followed by a valid first digit.
                        for start in range(total - 9):
                            if digits_str[start:start + 2] != "65" or digits_str[start + 2] not in sg_leading:
                                continue
                            if start > 0:
                                # Skip a "65" that is the tail of a listed foreign code.
                                # No entry in the current list ends in "65", so this
                                # guard does not trigger with the codes above.
                                tail = digits_str[max(0, start - 2):start] + "65"
                                if any(prefix.endswith(tail) for prefix in foreign_prefixes):
                                    continue
                            return "+" + digits_str[start:start + 10]

                    return None
                
                # Method 1: Extract from tel: links
                tel_links = [link for link in parent.select("a[href^='tel:'], a[href^='tel']") 
                            if is_element_visible(link)]
                
                for a in tel_links:
                    tel_href = a.get("href", "").replace("tel:", "").strip()
                    digits_only = re.sub(r"\D", "", tel_href)
                    formatted = validate_sg_phone(digits_only)
                    if formatted and formatted not in phones:
                        phones.append(formatted)
                
                # Method 2: Extract from dt/dd structure
                company_keywords = ["company contact", "business contact", "office phone", 
                                  "main phone", "business phone", "company phone", "contact number", 
                                  "phone", "tel", "mobile", "call", "contact no"]
                exclude_keywords = ["officer", "charge", "employee", "shareholder", "director", 
                                  "registration", "person", "individual", "member", "partner",
                                  "manager", "owner", "proprietor", "authorized", "representative",
                                  "appointment", "designation", "name of", "appointed"]
                
                visible_dt_tags = [dt for dt in parent.select("dt") if is_element_visible(dt)]
                
                for dt in visible_dt_tags:
                    dt_text = dt.get_text(strip=True).lower()
                    
                    is_company = any(kw in dt_text for kw in company_keywords)
                    is_personal = any(excl in dt_text for excl in exclude_keywords)
                    
                    if is_company and not is_personal:
                        dd = dt.find_next_sibling("dd")
                        if dd and is_element_visible(dd):
                            number_text = dd.get_text(" ", strip=True)
                            all_digits = re.sub(r"\D", "", number_text)
                            formatted = validate_sg_phone(all_digits)
                            if formatted and formatted not in phones:
                                phones.append(formatted)
                
                # Method 3: Fallback text search
                if not phones:
                    full_text = parent.get_text()
                    
                    sg_patterns = [
                        r"\+[\s\-\.]*65[\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                        r"\([\s\-\.]*\+?[\s\-\.]*65[\s\-\.]*\)[\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                        r"\+?[\s\-\.]*65[\s\-\.]*\([\s\-\.]*[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\)[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d",
                        r"(?<!\d)65[\s\-\.]+[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d(?!\d)",
                        r"(?<!\d)[689][\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d[\s\-\.]*\d(?!\d)",
                        r"(?<!\d)65[689]\d{7}(?!\d)",
                    ]
                    
                    for pattern in sg_patterns:
                        matches = re.findall(pattern, full_text)
                        for match in matches:
                            digits = re.sub(r"\D", "", match)
                            formatted = validate_sg_phone(digits)
                            if formatted and formatted not in phones:
                                phones.append(formatted)
                # ========== END PHONE EXTRACTION ==========

                # Extract website
                valid_websites = []
                for a in parent.select("a[href^=http]"):
                    href = a.get("href", "").strip()
                    href_lower = href.lower()
                    if not any(domain in href_lower for domain in SOCIAL_MEDIA_DOMAINS):
                        if not any(skip in href_lower for skip in ["recordowl", "apify.com"]):
                            if any(tld in href for tld in [".com", ".sg", ".net", ".org", ".co"]):
                                valid_websites.append(href)
                website = valid_websites[0] if valid_websites else None

            # Extract social media links from entire page
            for a in soup.find_all("a", href=True):
                href = a["href"].strip().lower()
                if "facebook.com" in href and href not in facebook_links:
                    facebook_links.append(href)
                elif "linkedin.com" in href and href not in linkedin_links:
                    linkedin_links.append(href)
                elif "instagram.com" in href and href not in instagram_links:
                    instagram_links.append(href)
                elif "tiktok.com" in href and href not in tiktok_links:
                    tiktok_links.append(href)

            # Extract registered address
            address = None
            try:
                label_candidates = ["registered address", "registered office address", "address", "principal place of business"]
                for dt in soup.select("dt"):
                    dt_text_lower = dt.get_text(" ", strip=True).lower()
                    if any(lbl in dt_text_lower for lbl in label_candidates):
                        dd = dt.find_next_sibling("dd")
                        if dd:
                            candidate = " ".join(dd.get_text(" ", strip=True).split())
                            if candidate:
                                address = candidate
                                break
                if not address:
                    addr_el = (soup.select_one("#address") or
                               soup.select_one("[id*='address']") or
                               soup.select_one("[aria-labelledby*='address']"))
                    if addr_el:
                        candidate = " ".join(addr_el.get_text(" ", strip=True).split())
                        if candidate:
                            address = candidate
            except Exception:
                address = None

            all_results.append({
                "UEN": uen,
                "Emails": emails if emails else None,
                "Phones": phones if phones else None,
                "Website": website,
                "Facebook": list(set(facebook_links)) if facebook_links else None,
                "LinkedIn": list(set(linkedin_links)) if linkedin_links else None,
                "Instagram": list(set(instagram_links)) if instagram_links else None,
                "TikTok": list(set(tiktok_links)) if tiktok_links else None,
                "address": address,
                "RecordOwl_Link": record_owl_url,
            })
            
            # Print extraction results with actual phone numbers
            if phones:
                phone_list = ", ".join(phones)
                print(f"  ‚úÖ Extracted: {len(emails) if emails else 0} email(s), {len(phones)} phone(s): {phone_list}")
            else:
                print(f"  ‚úÖ Extracted: {len(emails) if emails else 0} email(s), Phone: None found")
            
        except Exception as e:
            print(f"  ‚ùå Error parsing HTML for {uen}: {e}")
            all_results.append({
                "UEN": uen,
                "Emails": None,
                "Phones": None,
                "Website": None,
                "Facebook": None,
                "LinkedIn": None,
                "Instagram": None,
                "TikTok": None,
                "address": None,
                "RecordOwl_Link": record_owl_url or None,
                "Error": f"HTML parsing error: {str(e)}"
            })

    # Dynamic sleep time to avoid rate limiting and 403 blocks (between batches)
    base_sleep = 20
    random_addition = (batch_num % 10) + 5
    sleep_time = base_sleep + random_addition

    print(f"\n  üí§ Sleeping for {sleep_time}s before next batch...")
    time.sleep(sleep_time)

    # Extra delay after every 5th batch to further avoid detection
    if batch_num % 5 == 0:
        extra_wait = 30
        print(f"  üõë Checkpoint pause: waiting extra {extra_wait}s...")
        time.sleep(extra_wait)

# One row per processed UEN, whether the scrape succeeded or failed.
New_Fresh_Leads = pd.DataFrame(all_results)

# When nothing was processed `all_results` is empty and the frame has no
# columns at all; create the core columns so the summary below (and later
# cells that index them) do not raise KeyError.
for col in ("UEN", "Emails", "Phones", "Website", "Facebook", "LinkedIn",
            "Instagram", "TikTok", "address", "RecordOwl_Link"):
    if col not in New_Fresh_Leads.columns:
        New_Fresh_Leads[col] = None

# Ensure 'address' appears right after 'UEN'
if 'address' in New_Fresh_Leads.columns and 'UEN' in New_Fresh_Leads.columns:
    cols = list(New_Fresh_Leads.columns)
    cols.insert(1, cols.pop(cols.index('address')))
    New_Fresh_Leads = New_Fresh_Leads.loc[:, cols]

# Summary counts: a field counts as present when its cell is not null.
print("\n‚úÖ Scraping complete!")
print(f"\nüìä Results summary:")
print(f"   Total processed: {len(New_Fresh_Leads)}")
print(f"   With emails: {New_Fresh_Leads['Emails'].notna().sum()}")
print(f"   With phones: {New_Fresh_Leads['Phones'].notna().sum()}")
print(f"   With websites: {New_Fresh_Leads['Website'].notna().sum()}")

New_Fresh_Leads.head(10)


üîé Processing Batch 1/2 (5 UENs)
  üìã Added UEN to batch: 53431824W
  üìã Added UEN to batch: 202344030R
  üìã Added UEN to batch: T15LL1885G
  üìã Added UEN to batch: 53200915X
  üìã Added UEN to batch: 201733719E
  üì° Starting Apify run for batch: 53431824W, 202344030R, T15LL1885G, 53200915X, 201733719E (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> 2025-11-13T02:47:28.219Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> 2025-11-13T02:47:28.221Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> 2025-11-13T02:47:28.287Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> 2025-11-13T02:47:28.486Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:VX99G6Zb7OzSz3Y6r][0m -> 2025-11-13T02:47:29.904Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:VX9

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  ‚è≥ Dataset not ready, waiting 5s more...
  ‚è≥ Dataset not ready, waiting 10s more...
  ‚è≥ Dataset not ready, waiting 15s more...
  ‚è≥ Dataset not ready, waiting 20s more...
  üìä Dataset has 1 item(s)

  üîç Processing results for 53431824W...
  ‚úÖ Successfully scraped 53431824W (1118529 chars of HTML)
  ‚úÖ Extracted: 0 email(s), Phone: None found

  üîç Processing results for 202344030R...
  ‚ö†Ô∏è No dataset item found for UEN 202344030R
  ‚ùå No dataset item returned

  üîç Processing results for T15LL1885G...
  ‚ö†Ô∏è No dataset item found for UEN T15LL1885G
  ‚ùå No dataset item returned

  üîç Processing results for 53200915X...
  ‚ö†Ô∏è No dataset item found for UEN 53200915X
  ‚ùå No dataset item returned

  üîç Processing results for 201733719E...
  ‚ö†Ô∏è No dataset item found for UEN 201733719E
  ‚ùå No dataset item returned

  üí§ Sleeping for 26s befor

[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> 2025-11-13T02:49:58.383Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> 2025-11-13T02:49:58.385Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> 2025-11-13T02:49:58.424Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> 2025-11-13T02:49:58.637Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:pGuGaOMdCMNL1C3UT][0m -> 2025-11-13T02:49:59.307Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:pGu

KeyboardInterrupt: 

### Address Formatting 

In [None]:
# Pre-compile patterns for speed
# POSTAL_RE: a standalone 6-digit run, optionally preceded by the word
# "Singapore". The (?<!\d)/(?!\d) guards stop it from matching six digits
# taken out of a longer digit run (e.g. the tail of "1234567").
POSTAL_RE = re.compile(r"(?:\bSingapore\b\s*)?(?<!\d)(?P<postal>\d{6})(?!\d)", re.IGNORECASE)
# UNIT_RES: tried in order, most specific first:
#   1. "#AA-BB" style, 2. "unit AA-BB" style, 3. "unit AAAAA" (single token).
UNIT_RES = [
    re.compile(r"#\s*[A-Za-z0-9]{1,4}\s*[-‚Äì]\s*[A-Za-z0-9]{1,4}", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,4}\s*[-‚Äì]\s*[A-Za-z0-9]{1,4}\b", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,5}\b", re.IGNORECASE),
]

def normalize_spaces(text: str) -> str:
    """Flatten line breaks/tabs to spaces, squeeze whitespace runs, trim edges.

    Newlines, carriage returns and tabs become a single space; any run of two
    or more whitespace characters becomes one space; finally spaces and the
    separators ``, ; | /`` are stripped from both ends.
    """
    single_line = re.sub(r"[\n\r\t]+", " ", text)
    return re.sub(r"\s{2,}", " ", single_line).strip(" ,;|/")

def extract_postal(text: str) -> tuple[str, str | None]:
    """Pull the postal code out of an address string.

    The last ``POSTAL_RE`` match in ``text`` is treated as the postal code.
    The matched span is cut out, every remaining standalone "Singapore"
    (case-insensitive) is removed, and the remainder is normalised.

    Returns:
        ``(remaining_text, postal)``; ``postal`` is None when no match exists.
        Falsy input is returned unchanged with None.
    """
    if not text:
        return text, None

    # Walk all matches, keeping only the final one.
    last_hit = None
    for last_hit in POSTAL_RE.finditer(text):
        pass

    if last_hit is None:
        return normalize_spaces(text), None

    remainder = text[:last_hit.start()] + text[last_hit.end():]
    remainder = re.sub(r"\bSingapore\b", "", remainder, flags=re.IGNORECASE)
    return normalize_spaces(remainder), last_hit.group("postal")

def extract_unit(text: str) -> tuple[str, str | None]:
    """Pull a unit designation out of an address string.

    The patterns in ``UNIT_RES`` are tried in order and the first one that
    matches wins. The matched span is removed from the text; the unit itself
    has a leading "unit" label, inner spaces and leading ``#`` stripped and
    its dash normalised to ``-``.

    Returns:
        ``(remaining_text, unit)``; ``unit`` is None when no pattern matches.
        Falsy input is returned unchanged with None.
    """
    if not text:
        return text, None

    hit = next(filter(None, (rx.search(text) for rx in UNIT_RES)), None)
    if hit is None:
        return normalize_spaces(text), None

    remainder = normalize_spaces(text[:hit.start()] + text[hit.end():])

    unit = re.sub(r"^unit\s*[#:]?\s*", "", hit.group(0), flags=re.IGNORECASE)
    unit = normalize_spaces(unit)
    for old, new in ((' ‚Äì ', '-'), ('‚Äì', '-'), (' ', '')):
        unit = unit.replace(old, new)
    return remainder, unit.lstrip('#')

def clean_street(text: str) -> str | None:
    """Tidy the street portion of an address.

    Normalises whitespace, forces ", " after every comma, and title-cases the
    result unless it is already entirely upper-case. Falsy input gives None.
    """
    if not text:
        return None
    tidy = re.sub(r"\s*,\s*", ", ", normalize_spaces(text))
    if tidy.isupper():
        return tidy
    return tidy.title()

def split_address_sg(address: str) -> dict:
    """Split a raw address string into street, unit and postal-code parts.

    Processing order: normalise whitespace, extract the postal code, extract
    the unit, drop any remaining standalone "Singapore", then clean the street.

    Args:
        address: Raw address text. Non-string or blank values are accepted.

    Returns:
        dict with keys ``street``, ``unit``, ``postal_code`` and
        ``address_clean``. ``address_clean`` is
        "<street> <unit> Singapore <postal>" built from the parts that exist.
        Every value is None when the corresponding part is missing; all four
        are None for non-string or blank input.
    """
    if not isinstance(address, str) or not address.strip():
        return {"street": None, "unit": None, "postal_code": None, "address_clean": None}
    raw = normalize_spaces(address)
    without_postal, postal = extract_postal(raw)
    without_unit, unit = extract_unit(without_postal)
    without_unit = normalize_spaces(re.sub(r"\bSingapore\b", "", without_unit, flags=re.IGNORECASE))
    street = clean_street(without_unit)
    parts = [street, unit, f"Singapore {postal}" if postal else None]
    address_clean = normalize_spaces(" ".join(part for part in parts if part))
    # Nothing may survive cleaning (e.g. input "Singapore"); report that as
    # None like the other fields rather than as an empty string.
    return {"street": street, "unit": unit, "postal_code": postal, "address_clean": address_clean or None}

# Apply to current result DF -> create a new dataframe with clean components
if 'address' not in New_Fresh_Leads.columns:
    raise ValueError("Column 'address' not found in New_Fresh_Leads. Run the scraping cell first.")

# Parse every raw address. Explicit `columns` keeps the frame well-formed when
# there are zero rows; reusing the source index keeps the column assignments
# below row-aligned even if New_Fresh_Leads does not carry a default RangeIndex.
parsed_df = pd.DataFrame(
    [split_address_sg(raw_address) for raw_address in New_Fresh_Leads["address"]],
    columns=["street", "unit", "postal_code", "address_clean"],
    index=New_Fresh_Leads.index,
)

# New DataFrame with clean address fields and without raw 'address'
# (`drop` returns a new frame, so New_Fresh_Leads itself is left untouched).
Cleaned_New_Fresh_Leads = New_Fresh_Leads.drop(columns=['address'])
Cleaned_New_Fresh_Leads["operational_street"] = parsed_df["street"]
Cleaned_New_Fresh_Leads["operational_unit"] = parsed_df["unit"]
Cleaned_New_Fresh_Leads["operational_postal_code"] = parsed_df["postal_code"]
Cleaned_New_Fresh_Leads["operational_address"] = parsed_df["address_clean"]

# Save full result to a new DataFrame and display all columns
New_Fresh_Leads_Operational = Cleaned_New_Fresh_Leads.copy()

In [None]:
# Display the leads frame with the parsed operational_* address columns.
New_Fresh_Leads_Operational

In [None]:
# New_Fresh_Leads_Operational.to_csv("New_Fresh_Leads_Operational.csv")