### Data Mining in RecordOwl (Silver 1)

In [46]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


### Ingesting from previous layer

In [49]:
# Location of the Bronze-layer parquet file ingested by this notebook.
parquet_path = "./Staging/Bronze/bronze_data_1.parquet"

# Fail fast with an explicit message when the upstream file is absent.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

acra_data_filtered_by_industry = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(acra_data_filtered_by_industry)} rows from {parquet_path}")
print(acra_data_filtered_by_industry.shape)

Loaded 200 rows from ./Staging/Bronze/bronze_data_1.parquet
(200, 14)


In [48]:
# Display the ingested frame as the cell output (visual sanity check).
acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,202418445R,CL EDUCATION PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,08-05-2024,88911,na,MOUNT SINAI DRIVE,277116,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
1,53471474M,CORE CONCEPTS TUITION,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-08-2023,85509,na,COMPASSVALE ROAD,544753,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
2,202527935C,EDULUMINA PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,28-06-2025,85509,na,CECIL STREET,049705,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
3,200510117W,AUG GLOBAL NETWORK SINGAPORE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,21-07-2005,85501,64202,MAXWELL ROAD,69111,Others,Educational,Industry-Specific Vocational & Professional Tr...,"Training Courses For Construction, Real Estate..."
4,202309115H,LUCIDUS INTERNATIONAL PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,10-03-2023,85509,85402,MOUNT SINAI ROAD,276881,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,53466472E,VALWORKS,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,25-04-2023,88991,na,MONTREAL LINK,751592,Others,Hospital,Social Services (Without Accommodations),Job Training And Vocational Rehabilitation Ser...
196,201410937C,CARLTON INTERNATIONAL EDUCATION GROUP PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,15-04-2014,85509,85409,UPPER PAYA LEBAR ROAD,534818,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
197,202001114R,ACE @ WORK EDUFUN PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,08-01-2020,88912,na,BUKIT BATOK CRESCENT,658080,Others,Hospital,Social Services (Without Accommodations),Student Care Services; Child Minding Services ...
198,202508287E,YANG AND HUAT PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,23-01-2025,85509,na,HOLLAND HILL,278738,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.


### Mining RecordOwl

In [None]:
# IMPROVED SCRAPER - With comprehensive extraction from Silver_2
import os
from apify_client import ApifyClient
import pandas as pd
import time

# SECURITY: the Apify API token must never be committed with the notebook.
# It is read from the APIFY_TOKEN environment variable instead. Any token that
# was previously hardcoded in this cell should be revoked in the Apify console.
_apify_token = os.environ.get("APIFY_TOKEN", "").strip()
if not _apify_token:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set. "
        "Export your Apify API token before running this cell."
    )

client = ApifyClient(_apify_token)

# CONFIG
BATCH_SIZE = 5        # number of UENs sent to one Apify actor run
MAX_CONCURRENCY = 2   # forwarded to the actor as "maxConcurrency"
MAX_RETRIES = 2       # NOTE(review): not referenced by the scraping code below;
                      # run_scraper sets "maxRequestRetries" itself

def create_pagefunction_v4() -> str:
    """Return the JavaScript ``pageFunction`` source sent to the Apify actor.

    The returned script runs once per start URL (a RecordOwl search page whose
    ``userData.uen`` carries the UEN) and:

    1. waits for the search results to render;
    2. picks the ``/company/`` link whose surrounding text (up to five
       ancestors) contains the UEN, falling back to the first company link;
    3. navigates to that link with ``page.goto``;
    4. extracts emails, phones, website, social links and address from the
       company page.

    The script resolves to an object with ``status`` (``'success'``,
    ``'not_found'`` or ``'error'``), ``uen`` and, on success, ``url``,
    ``uenMatched`` (``false`` when the first-link fallback was used) and the
    extracted fields.

    Returns:
        str: JavaScript source of ``async function pageFunction(context)``.
    """
    return """
async function pageFunction(context) {
    const { page, log, request } = context;
    const uen = request?.userData?.uen || '';
    const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms));

    log.info(`🚀 SCRAPING UEN: ${uen} from ${page.url()}`);
    if (!uen) return { status: 'error', uen: null, error: 'Missing UEN' };

    try {
        // Wait for search results
        log.info('⏳ Waiting for search results...');
        await page.waitForSelector('.flex-1.min-w-0', { timeout: 40000 });
        await wait(1500);

        // Find the company link. Only its URL is resolved here and the
        // navigation is done explicitly below: clicking inside evaluate()
        // starts a navigation that can finish before a navigation wait is
        // registered, which then times out.
        log.info('🔍 Finding company link...');
        const target = await page.evaluate((targetUen) => {
            const uenUpper = targetUen.toUpperCase();
            const allLinks = document.querySelectorAll('a[href*="/company/"]');

            for (const link of allLinks) {
                let parent = link.parentElement;
                for (let i = 0; i < 5 && parent; i++) {
                    const parentText = (parent.innerText || parent.textContent || '');
                    if (parentText.toUpperCase().includes(uenUpper)) {
                        return { href: link.href, uenMatched: true };
                    }
                    parent = parent.parentElement;
                }
            }
            // Fallback: first company link, flagged so callers can audit it
            if (allLinks.length > 0) { return { href: allLinks[0].href, uenMatched: false }; }
            return null;
        }, uen);

        if (!target) {
            return { status: 'not_found', uen, error: 'No search results' };
        }
        if (!target.uenMatched) {
            log.info(`UEN ${uen} not found next to any result link; using first company link`);
        }

        // Open company page
        log.info('📄 Navigating to company page...');
        await page.goto(target.href, { waitUntil: 'domcontentloaded', timeout: 60000 });
        await wait(2000);

        // Extract data with comprehensive phone extraction from Silver_2
        log.info('📊 Extracting data...');
        const data = await page.evaluate(() => {
            const SOCIAL_MEDIA_DOMAINS = ['facebook.com','linkedin.com','instagram.com','tiktok.com','twitter.com','x.com','youtube.com','pinterest.com'];

            // ========== EMAIL EXTRACTION ==========
            const emails = [];
            document.querySelectorAll('a[href^="mailto:"]').forEach(a => {
                const email = a.href.replace('mailto:', '').trim();
                if (email && email.includes('@') && !emails.includes(email)) {
                    emails.push(email);
                }
            });

            // ========== COMPREHENSIVE PHONE EXTRACTION ==========
            const phones = [];

            // Format phone to Singapore standard
            function formatSingaporePhone(text) {
                const digitsOnly = text.replace(/\\D/g, '');

                // Singapore number patterns (8 digits)
                if (digitsOnly.length === 8) {
                    return '+65' + digitsOnly;
                }
                // With country code
                if (digitsOnly.length === 10 && digitsOnly.startsWith('65')) {
                    return '+' + digitsOnly;
                }
                if (digitsOnly.length === 11 && digitsOnly.startsWith('65')) {
                    return '+65' + digitsOnly.slice(2);
                }
                // With +
                if (digitsOnly.length >= 10) {
                    for (let i = 0; i <= digitsOnly.length - 10; i++) {
                        if (digitsOnly.slice(i, i+2) === '65' && digitsOnly.length - i >= 10) {
                            return '+' + digitsOnly.slice(i, i+10);
                        }
                    }
                }
                return null;
            }

            // Method 1: Extract from tel: links
            document.querySelectorAll('a[href^="tel:"]').forEach(a => {
                const telHref = a.href.replace('tel:', '').trim();
                const formatted = formatSingaporePhone(telHref);
                if (formatted && !phones.includes(formatted)) {
                    phones.push(formatted);
                }
            });

            // Method 2: Extract from dt/dd structure
            const companyKeywords = ['company contact', 'business contact', 'office phone', 'main phone', 'business phone', 'company phone', 'contact number', 'phone', 'tel', 'mobile', 'call', 'contact no'];

            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (companyKeywords.some(kw => dtText.includes(kw))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        const ddText = dd.textContent.trim();
                        const formatted = formatSingaporePhone(ddText);
                        if (formatted && !phones.includes(formatted)) {
                            phones.push(formatted);
                        }
                    }
                }
            });

            // Method 3: Look for phone patterns in visible text
            const bodyText = document.body.innerText || document.body.textContent;
            const phonePatterns = [
                /\\b(\\+65[\\s\\-]?)?([689]\\d{3}[\\s\\-]?\\d{4})\\b/g,
                /\\b65[\\s\\-]?([689]\\d{3})[\\s\\-]?(\\d{4})\\b/g,
                /\\b([689]\\d{3})[\\s\\-](\\d{4})\\b/g
            ];

            phonePatterns.forEach(pattern => {
                const matches = bodyText.matchAll(pattern);
                for (const match of matches) {
                    const formatted = formatSingaporePhone(match[0]);
                    if (formatted && !phones.includes(formatted)) {
                        phones.push(formatted);
                    }
                }
            });

            // ========== WEBSITE EXTRACTION ==========
            const websites = [];
            document.querySelectorAll('a[href^="http"]').forEach(a => {
                const href = a.href.trim().toLowerCase();
                if (!SOCIAL_MEDIA_DOMAINS.some(d => href.includes(d)) && !href.includes('recordowl') && !href.includes('apify')) {
                    if (href.match(/\\.(com|sg|net|org|co)/)) {
                        websites.push(a.href);
                    }
                }
            });

            // ========== SOCIAL MEDIA EXTRACTION ==========
            const facebook = [];
            const linkedin = [];
            const instagram = [];
            const tiktok = [];

            document.querySelectorAll('a[href*="facebook.com"]').forEach(a => {
                if (!facebook.includes(a.href)) facebook.push(a.href);
            });
            document.querySelectorAll('a[href*="linkedin.com"]').forEach(a => {
                if (!linkedin.includes(a.href)) linkedin.push(a.href);
            });
            document.querySelectorAll('a[href*="instagram.com"]').forEach(a => {
                if (!instagram.includes(a.href)) instagram.push(a.href);
            });
            document.querySelectorAll('a[href*="tiktok.com"]').forEach(a => {
                if (!tiktok.includes(a.href)) tiktok.push(a.href);
            });

            // ========== ADDRESS EXTRACTION ==========
            let address = null;
            const labelCandidates = ['registered address', 'registered office address', 'address', 'principal place of business'];

            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (labelCandidates.some(label => dtText.includes(label))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        address = dd.textContent.trim();
                    }
                }
            });

            return {
                emails: emails.length ? emails : null,
                phones: phones.length ? phones : null,
                website: websites.length ? websites[0] : null,
                facebook: facebook.length ? facebook : null,
                linkedin: linkedin.length ? linkedin : null,
                instagram: instagram.length ? instagram : null,
                tiktok: tiktok.length ? tiktok : null,
                address: address
            };
        });

        log.info(`✅ SUCCESS: ${uen} | Phones: ${data.phones ? data.phones.length : 0} | Emails: ${data.emails ? data.emails.length : 0}`);
        return { status: 'success', uen, url: page.url(), uenMatched: target.uenMatched, ...data };

    } catch (err) {
        log.error(`❌ ERROR for ${uen}: ${err.message}`);
        return { status: 'error', uen, error: err.message };
    }
}
"""

def run_scraper(client, uens):
    """Run one ``apify/puppeteer-scraper`` job for a batch of UENs.

    Args:
        client: Apify client exposing ``actor()``, ``run()`` and ``dataset()``.
        uens: list of UEN strings; one RecordOwl search URL is created per UEN
            and the UEN is passed to the page function through ``userData``.

    Returns:
        tuple: ``(items, error)``. On success ``items`` is the list of dataset
        items and ``error`` is ``None``. On failure ``items`` is ``[]`` and
        ``error`` is a message describing why the run produced no data.
    """
    # Nothing to scrape: do not start (and pay for) an actor run.
    if not uens:
        return [], None

    start_urls = [
        {"url": f"https://recordowl.com/search?name={uen}", "userData": {"uen": uen}}
        for uen in uens
    ]

    print(f"\n  📋 Start URLs created:")
    for i, item in enumerate(start_urls, 1):
        print(f"      {i}. {item['url']}")

    run_input = {
        "startUrls": start_urls,
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "pageFunction": create_pagefunction_v4(),
        "maxRequestRetries": 3,
        "maxRequestsPerCrawl": len(start_urls),
        "maxConcurrency": MAX_CONCURRENCY,
        "pageLoadTimeoutSecs": 120,
        "pageFunctionTimeoutSecs": 360,
        "waitUntil": ["domcontentloaded"],
        "proxyConfiguration": {"useApifyProxy": True, "apifyProxyGroups": ["RESIDENTIAL"]},
    }

    print(f"\n  🚀 Launching Apify actor with {MAX_CONCURRENCY} concurrent browsers...")
    run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
    if not run:
        # The client can hand back no run object; report it instead of
        # failing on run['id'] below.
        return [], "Scraping failed (actor run was not created)"
    print(f"  ⏳ Run ID: {run['id']}")

    # wait_for_finish() may return None; fall back to the object from call().
    run_info = client.run(run["id"]).wait_for_finish() or run
    status = run_info.get("status")

    print(f"  📊 Status: {status}")

    if status == "SUCCEEDED" and "defaultDatasetId" in run:
        time.sleep(2)
        dataset = client.dataset(run["defaultDatasetId"])
        items = list(dataset.iterate_items())
        print(f"  ✅ Retrieved {len(items)}/{len(uens)} results")
        return items, None

    return [], f"Scraping failed (run status: {status})"


# Execute scraper
print("="*70)
print("🎯 APIFY SCRAPER - V4 (Comprehensive Extraction)")
print("="*70)
print(f"📊 Config: {MAX_CONCURRENCY} concurrent, {BATCH_SIZE} UENs per batch")

# Output fields filled from a scraped item; all None when a UEN has no item.
RESULT_FIELDS = ['Emails', 'Phones', 'Website', 'Facebook', 'LinkedIn',
                 'Instagram', 'TikTok', 'address', 'RecordOwl_Link']
# Seconds to pause between two consecutive actor runs.
BATCH_PAUSE_SECS = 10


def _empty_result(uen, status, error):
    """Build a result row for a UEN that produced no scraped data."""
    return {"UEN": uen, "Status": status, "Error": error, **dict.fromkeys(RESULT_FIELDS)}


all_results = []
total_rows = len(acra_data_filtered_by_industry)

for batch_idx in range(0, total_rows, BATCH_SIZE):
    # Pause between runs only: never after the last batch, and also after a
    # failed batch (previously the pause was skipped on failure).
    if batch_idx > 0:
        time.sleep(BATCH_PAUSE_SECS)

    batch = acra_data_filtered_by_industry.iloc[batch_idx:batch_idx + BATCH_SIZE]
    uens = batch['UEN'].astype(str).str.strip().tolist()

    print(f"\n{'─'*70}")
    print(f"📦 Batch {(batch_idx//BATCH_SIZE)+1} - UENs: {', '.join(uens)}")

    items, error = run_scraper(client, uens)

    if error:
        print(f"  ❌ Batch error: {error}")
        all_results.extend(_empty_result(uen, "error", error) for uen in uens)
        continue

    # Map results by the UEN echoed back from the page function
    uen_map = {item.get('uen'): item for item in items if item.get('uen')}

    for uen in uens:
        item = uen_map.get(uen)
        if not item:
            print(f"    ⚠️  {uen}: Not found in results")
            all_results.append(_empty_result(uen, "missing", "No data"))
            continue

        status = item.get('status', 'error')
        if status == 'success':
            phones = len(item.get('phones') or [])
            emails = len(item.get('emails') or [])
            website = 'Yes' if item.get('website') else 'No'
            print(f"    ✅ {uen}: {phones} phones, {emails} emails, Website: {website}")
        else:
            print(f"    ❌ {uen}: {status} - {item.get('error', 'Unknown')}")

        all_results.append({
            'UEN': uen,
            'Status': status,
            'Error': item.get('error'),
            'Emails': item.get('emails'),
            'Phones': item.get('phones'),
            'Website': item.get('website'),
            'Facebook': item.get('facebook'),
            'LinkedIn': item.get('linkedin'),
            'Instagram': item.get('instagram'),
            'TikTok': item.get('tiktok'),
            'address': item.get('address'),
            'RecordOwl_Link': item.get('url')
        })

# Create DataFrame
# Columns are listed explicitly so the frame has the expected schema even when
# no result rows were collected (otherwise the summary below raises KeyError).
RESULT_COLUMNS = ['UEN', 'Status', 'Error', 'Emails', 'Phones', 'Website', 'Facebook',
                  'LinkedIn', 'Instagram', 'TikTok', 'address', 'RecordOwl_Link']
New_Fresh_Leads = pd.DataFrame(all_results, columns=RESULT_COLUMNS)

print(f"\n{'='*70}")
print(f"✅ COMPLETE")
print(f"{'='*70}")
print(f"📊 Total: {len(New_Fresh_Leads)}")
print(f"✅ Success: {(New_Fresh_Leads['Status']=='success').sum()}")
print(f"❌ Failed: {(New_Fresh_Leads['Status']=='error').sum()}")
# The page function also reports 'not_found'; without this line those rows were
# counted in no bucket and the summary did not add up to the total.
print(f"🔍 Not found: {(New_Fresh_Leads['Status']=='not_found').sum()}")
print(f"⚠️  Missing: {(New_Fresh_Leads['Status']=='missing').sum()}")
print(f"📞 Phones found: {New_Fresh_Leads['Phones'].notna().sum()}")
print(f"📧 Emails found: {New_Fresh_Leads['Emails'].notna().sum()}")
print(f"🌐 Websites found: {New_Fresh_Leads['Website'].notna().sum()}")
print(f"📘 Facebook found: {New_Fresh_Leads['Facebook'].notna().sum()}")
print(f"📸 Instagram found: {New_Fresh_Leads['Instagram'].notna().sum()}")
print(f"{'='*70}")

New_Fresh_Leads.head(10)

### DIAGNOSTIC Test

In [None]:
# # DIAGNOSTIC CODE - Run this to identify the issue
# try:
#     print("Testing Apify API connection...")

#     # Test 1: Check if API key is valid
#     user_info = client.user().get()
#     print(f"✅ API Key valid - User: {user_info.get('username', 'Unknown')}")

#     # Test 2: Check if the actor exists
#     try:
#         actor_info = client.actor("apify/puppeteer-scraper").get()
#         print(f"✅ Actor found: {actor_info.get('name', 'Unknown')}")
#     except Exception as e:
#         print(f"❌ Actor not found: {e}")

# except Exception as e:
#     print(f"❌ API Connection Failed: {type(e).__name__}: {e}")

### Address Formatting 

In [None]:
# Pre-compile patterns for speed

# Six-digit postal code, optionally preceded by the word "Singapore". The digit
# guards on BOTH sides keep the pattern from carving a "postal code" out of a
# longer digit run such as an 8-digit phone number.
POSTAL_RE = re.compile(r"(?:\bSingapore\b\s*)?(?<!\d)(?P<postal>\d{6})(?!\d)", re.IGNORECASE)

# Unit-number patterns, tried in order (first pattern that matches wins).
# - The "unit ..." forms require a separator (whitespace, '#' or ':') or a digit
#   right after the word, so words that merely start with "unit" (e.g.
#   "United Square") are not parsed as a unit.
# - The "unit <floor>-<unit>" form comes first so that "Unit #10-11" is removed
#   as a whole instead of leaving a stray "Unit" in the street.
# - The en dash is written as \u2013 so the patterns do not depend on the
#   encoding of this file.
UNIT_RES = [
    re.compile(r"\bunit(?:\s*[#:]\s*|\s+|(?=\d))[A-Za-z0-9]{1,4}\s*[-\u2013]\s*[A-Za-z0-9]{1,4}\b", re.IGNORECASE),
    re.compile(r"#\s*[A-Za-z0-9]{1,4}\s*[-\u2013]\s*[A-Za-z0-9]{1,4}", re.IGNORECASE),
    re.compile(r"\bunit(?:\s*[#:]\s*|\s+|(?=\d))[A-Za-z0-9]{1,5}\b", re.IGNORECASE),
]

def normalize_spaces(text: str) -> str:
    """Flatten whitespace in ``text`` and trim separator characters.

    Runs of newline, carriage-return or tab characters become one space, any
    run of two or more whitespace characters becomes one space, and leading or
    trailing spaces, commas, semicolons, pipes and slashes are stripped.
    """
    single_line = re.sub(r"[\n\r\t]+", " ", text)
    squeezed = re.sub(r"\s{2,}", " ", single_line)
    return squeezed.strip(" ,;|/")

def extract_postal(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    matches = list(POSTAL_RE.finditer(text))
    if matches:
        m = matches[-1]
        postal = m.group("postal")
        start, end = m.span()
        cleaned = text[:start] + text[end:]
        cleaned = re.sub(r"\bSingapore\b", "", cleaned, flags=re.IGNORECASE)
        return normalize_spaces(cleaned), postal
    return normalize_spaces(text), None

def extract_unit(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    for rx in UNIT_RES:
        m = rx.search(text)
        if m:
            unit_raw = m.group(0)
            cleaned = normalize_spaces(text[:m.start()] + text[m.end():])
            unit_digits = re.sub(r"^unit\s*[#:]?\s*", "", unit_raw, flags=re.IGNORECASE)
            unit_digits = normalize_spaces(unit_digits)
            unit_digits = unit_digits.replace(' ‚Äì ', '-').replace('‚Äì', '-').replace(' ', '')
            unit_digits = unit_digits.lstrip('#')
            return cleaned, unit_digits
    return normalize_spaces(text), None

def clean_street(text: str) -> str | None:
    if not text:
        return None
    text = normalize_spaces(text)
    text = re.sub(r"\s*,\s*", ", ", text)
    return text if text.isupper() else text.title()

def split_address_sg(address: str) -> dict:
    """Break a Singapore address into street, unit and postal code.

    Returns a dict with the keys ``street``, ``unit``, ``postal_code`` and
    ``address_clean`` (the non-empty parts joined as
    "<street> <unit> Singapore <postal>"). All four values are ``None`` when
    ``address`` is not a string or is blank.
    """
    if not isinstance(address, str):
        return {"street": None, "unit": None, "postal_code": None, "address_clean": None}
    if not address.strip():
        return {"street": None, "unit": None, "postal_code": None, "address_clean": None}

    flattened = normalize_spaces(address)
    remainder, postal = extract_postal(flattened)
    remainder, unit = extract_unit(remainder)

    # Drop any leftover "Singapore" word before tidying the street.
    remainder = re.sub(r"\bSingapore\b", "", remainder, flags=re.IGNORECASE)
    street = clean_street(normalize_spaces(remainder))

    pieces = [street or "", unit or ""]
    pieces.append(f"Singapore {postal}" if postal else "")
    address_clean = normalize_spaces(" ".join(piece for piece in pieces if piece))

    return {"street": street, "unit": unit, "postal_code": postal, "address_clean": address_clean}

# Apply to current result DF -> create a new dataframe with clean components
if 'address' not in New_Fresh_Leads.columns:
    raise ValueError("Column 'address' not found in New_Fresh_Leads. Run the scraping cell first.")

# Parse every raw address. The parsed frame reuses the leads' index so the
# column assignment below aligns row by row whatever index the leads carry, and
# the columns are named explicitly so an empty leads frame still works.
parsed_df = pd.DataFrame(
    New_Fresh_Leads["address"].map(split_address_sg).tolist(),
    index=New_Fresh_Leads.index,
    columns=["street", "unit", "postal_code", "address_clean"],
)

# New DataFrame with clean address fields and without raw 'address'
Cleaned_New_Fresh_Leads = New_Fresh_Leads.drop(columns=['address']).assign(
    operational_street=parsed_df["street"],
    operational_unit=parsed_df["unit"],
    operational_postal_code=parsed_df["postal_code"],
    operational_address=parsed_df["address_clean"],
)

# Save full result to a new DataFrame and display all columns
New_Fresh_Leads_Operational = Cleaned_New_Fresh_Leads.copy()
New_Fresh_Leads_Operational



### Check for duplication of UEN and Phone Number

In [None]:
# Boolean masks for duplicates
uen_dup = New_Fresh_Leads_Operational["UEN"].duplicated(keep=False)

# "Phones" holds a LIST of numbers per row (or None). Lists are unhashable, so
# Series.duplicated() on the column raises TypeError. The lists are therefore
# exploded to one row per (lead, phone) pair, and a lead is flagged when one of
# its numbers also appears on another lead.
_phone_pairs = (
    New_Fresh_Leads_Operational["Phones"]
    .explode()
    .dropna()
    .rename("phone")
    .rename_axis("row")
    .reset_index()
    .drop_duplicates()   # a number repeated within one lead is not a duplicate
)
_shared_rows = _phone_pairs.loc[_phone_pairs["phone"].duplicated(keep=False), "row"]
phone_dup = pd.Series(
    New_Fresh_Leads_Operational.index.isin(_shared_rows),
    index=New_Fresh_Leads_Operational.index,
)

# YES/NO summary
print(
    "UEN dup:", "YES" if uen_dup.any() else "NO",
    "| Phone dup:", "YES" if phone_dup.any() else "NO"
)

# Show duplicate rows if exist
if uen_dup.any():
    print("\n🔁 Duplicate UEN rows:")
    display(New_Fresh_Leads_Operational[uen_dup])

if phone_dup.any():
    print("\n📱 Duplicate Phone rows:")
    display(New_Fresh_Leads_Operational[phone_dup])


In [None]:
# (rows, columns) of the final frame, shown as the cell output.
New_Fresh_Leads_Operational.shape

In [None]:
# Preview the first five rows of the final frame.
New_Fresh_Leads_Operational.head(5)

In [None]:
# Persist the result for the next layer. The target folder is created first so
# the write cannot fail on a fresh checkout after the scraping run has finished.
silver_path = "./Staging/Silver/Silver_data_2_200.parquet"
os.makedirs(os.path.dirname(silver_path), exist_ok=True)
New_Fresh_Leads_Operational.to_parquet(silver_path, index=False, engine="fastparquet")

In [None]:
# New_Fresh_Leads_Operational.to_csv("New_Fresh_Leads_Operational.csv")