### Data Mining in RecordOwl (Silver 1)

In [1]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


### Ingesting from previous layer

In [2]:
# Location of the bronze-layer parquet file ingested by this notebook.
parquet_path = "./Staging/Bronze/bronze_data_1.parquet"

# Stop immediately when the input file is absent.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

acra_data_filtered_by_industry = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(acra_data_filtered_by_industry)} rows from {parquet_path}")
print(acra_data_filtered_by_industry.shape)

Loaded 500 rows from ./Staging/Bronze/bronze_data_1.parquet
(500, 14)


In [3]:
# Restrict this run to the first 50 rows of the ingested frame.
acra_data_filtered_by_industry = acra_data_filtered_by_industry.iloc[:50]

### Mining RecordOwl

In [4]:
# COST-OPTIMIZED SCRAPER - V10 FIXED (No waitForTimeout)
from apify_client import ApifyClient
import pandas as pd
import time
import json
from requests.exceptions import RequestException

import os

# The Apify API token is a credential, so it is read from the environment
# rather than being written into the notebook.
# NOTE(review): the token that used to be hard-coded on this line has been
# exposed in the file history and should be revoked and replaced.
_apify_token = os.environ.get("APIFY_TOKEN")
if not _apify_token:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set; "
        "export your Apify API token before running this cell."
    )

client = ApifyClient(_apify_token)

BATCH_SIZE = 50  # number of UENs sent to one run_scraper() call
MAX_CONCURRENCY = 3  # forwarded as the actor's "maxConcurrency"
MAX_RETRIES = 3  # forwarded as the actor's "maxRequestRetries"

def create_pagefunction_v9_fixed() -> str:
    """Return the JavaScript ``pageFunction`` source sent to the Puppeteer actor.

    V9: Fixed - removed waitForTimeout, using proper Puppeteer methods.

    The returned script reads the UEN from ``request.userData.uen``, opens the
    search result whose surrounding text contains that UEN (falling back to
    the first result), checks that the UEN appears on the company page, and
    then collects emails, phone numbers, a website, social links and the
    address.

    Returns:
        The JavaScript source as a string. The script itself resolves to an
        object whose ``status`` is one of ``success``, ``not_found``,
        ``uen_mismatch`` or ``error``.
    """
    # This is a normal (non-raw) Python string, so every backslash meant for
    # JavaScript is doubled here, and backslashes inside JavaScript *string*
    # literals (the RegExp constructor arguments) are doubled once more.
    return """
async function pageFunction(context) {
    const { page, log, request } = context;
    const uen = request?.userData?.uen || '';

    if (!uen) return { status: 'error', uen: null, error: 'Missing UEN' };

    try {
        // Wait for search results with fallback
        try {
            await page.waitForSelector('.flex-1.min-w-0', { timeout: 20000 });
        } catch (e) {
            await page.waitForSelector('a[href*="/company/"]', { timeout: 10000 });
        }

        // Find and click link with EXACT UEN match
        const linkClickResult = await page.evaluate((targetUen) => {
            const uenUpper = targetUen.toUpperCase();
            // Escape regex metacharacters once, then require a whole-token match
            const uenPattern = new RegExp('\\\\b' + uenUpper.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&') + '\\\\b');
            const allLinks = document.querySelectorAll('a[href*="/company/"]');

            if (allLinks.length === 0) {
                return { clicked: false, reason: 'No company links found' };
            }

            // First pass: EXACT UEN match
            for (const link of allLinks) {
                let parent = link.parentElement;
                for (let i = 0; i < 5 && parent; i++) {
                    const parentText = (parent.innerText || parent.textContent || '').toUpperCase();

                    if (uenPattern.test(parentText)) {
                        link.click();
                        return { clicked: true, href: link.getAttribute('href'), matchType: 'exact' };
                    }
                    parent = parent.parentElement;
                }
            }

            // Fallback: Click first result
            allLinks[0].click();
            return { clicked: true, href: allLinks[0].getAttribute('href'), matchType: 'fallback' };
        }, uen);

        if (!linkClickResult.clicked) {
            return { status: 'not_found', uen, error: linkClickResult.reason || 'No search results' };
        }

        // Wait for navigation
        try {
            await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 25000 });
        } catch (navError) {
            const currentUrl = page.url();
            if (!currentUrl.includes('/company/')) {
                return { status: 'error', uen, error: 'Navigation failed' };
            }
        }

        // Wait for content with multiple fallback selectors
        try {
            await page.waitForSelector('dt', { timeout: 15000 });
        } catch (e1) {
            try {
                await page.waitForSelector('dd', { timeout: 10000 });
            } catch (e2) {
                try {
                    await page.waitForSelector('a[href^="mailto:"]', { timeout: 5000 });
                } catch (e3) {
                    // Continue anyway - some pages might not have these elements
                }
            }
        }

        // Verify UEN on the company page
        const uenVerification = await page.evaluate((targetUen) => {
            const uenUpper = targetUen.toUpperCase();
            const pageText = (document.body.innerText || document.body.textContent || '').toUpperCase();
            const uenPattern = new RegExp('\\\\b' + uenUpper.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&') + '\\\\b');

            let uenFound = uenPattern.test(pageText);
            let uenInStructure = false;

            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (dtText.includes('uen') || dtText.includes('registration') || dtText.includes('business registration')) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        const ddText = dd.textContent.toUpperCase().trim();
                        if (uenPattern.test(ddText)) {
                            uenInStructure = true;
                        }
                    }
                }
            });

            return {
                uenFoundInPage: uenFound || uenInStructure,
                pageUrl: window.location.href
            };
        }, uen);

        // If UEN not found, return mismatch
        if (!uenVerification.uenFoundInPage) {
            return {
                status: 'uen_mismatch',
                uen,
                url: uenVerification.pageUrl,
                error: 'UEN not found on company page'
            };
        }

        // Extract data
        const data = await page.evaluate(() => {
            const SOCIAL_MEDIA_DOMAINS = ['facebook.com','linkedin.com','instagram.com','tiktok.com','twitter.com','x.com','youtube.com','pinterest.com'];

            // EMAIL
            const emails = [];
            document.querySelectorAll('a[href^="mailto:"]').forEach(a => {
                const email = a.href.replace('mailto:', '').trim();
                if (email && email.includes('@') && !emails.includes(email)) {
                    emails.push(email);
                }
            });

            // PHONE
            const phones = [];

            function formatSingaporePhone(text) {
                const digitsOnly = text.replace(/\\D/g, '');
                if (digitsOnly.length === 8) return '+65' + digitsOnly;
                if (digitsOnly.length === 10 && digitsOnly.startsWith('65')) return '+' + digitsOnly;
                if (digitsOnly.length === 11 && digitsOnly.startsWith('65')) return '+65' + digitsOnly.slice(2);
                if (digitsOnly.length >= 10) {
                    for (let i = 0; i <= digitsOnly.length - 10; i++) {
                        if (digitsOnly.slice(i, i+2) === '65' && digitsOnly.length - i >= 10) {
                            return '+' + digitsOnly.slice(i, i+10);
                        }
                    }
                }
                return null;
            }

            // tel: links
            document.querySelectorAll('a[href^="tel:"]').forEach(a => {
                const formatted = formatSingaporePhone(a.href.replace('tel:', '').trim());
                if (formatted && !phones.includes(formatted)) phones.push(formatted);
            });

            // dt/dd structure
            const phoneKeywords = ['company contact', 'business contact', 'office phone', 'main phone', 'business phone', 'company phone', 'contact number', 'phone', 'tel', 'mobile', 'call', 'contact no'];
            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (phoneKeywords.some(kw => dtText.includes(kw))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        const formatted = formatSingaporePhone(dd.textContent.trim());
                        if (formatted && !phones.includes(formatted)) phones.push(formatted);
                    }
                }
            });

            // Pattern matching
            const bodyText = document.body.innerText || document.body.textContent;
            const phonePatterns = [
                /\\b(\\+65[\\s\\-]?)?([689]\\d{3}[\\s\\-]?\\d{4})\\b/g,
                /\\b65[\\s\\-]?([689]\\d{3})[\\s\\-]?(\\d{4})\\b/g,
                /\\b([689]\\d{3})[\\s\\-](\\d{4})\\b/g
            ];
            phonePatterns.forEach(pattern => {
                const matches = bodyText.matchAll(pattern);
                for (const match of matches) {
                    const formatted = formatSingaporePhone(match[0]);
                    if (formatted && !phones.includes(formatted)) phones.push(formatted);
                }
            });

            // WEBSITE
            const websites = [];
            document.querySelectorAll('a[href^="http"]').forEach(a => {
                const href = a.href.trim().toLowerCase();
                if (!SOCIAL_MEDIA_DOMAINS.some(d => href.includes(d)) && !href.includes('recordowl') && !href.includes('apify')) {
                    if (href.match(/\\.(com|sg|net|org|co)/)) websites.push(a.href);
                }
            });

            // SOCIAL MEDIA
            const facebook = [], linkedin = [], instagram = [], tiktok = [];
            document.querySelectorAll('a[href*="facebook.com"]').forEach(a => { if (!facebook.includes(a.href)) facebook.push(a.href); });
            document.querySelectorAll('a[href*="linkedin.com"]').forEach(a => { if (!linkedin.includes(a.href)) linkedin.push(a.href); });
            document.querySelectorAll('a[href*="instagram.com"]').forEach(a => { if (!instagram.includes(a.href)) instagram.push(a.href); });
            document.querySelectorAll('a[href*="tiktok.com"]').forEach(a => { if (!tiktok.includes(a.href)) tiktok.push(a.href); });

            // ADDRESS
            let address = null;
            const addressLabels = ['registered address', 'registered office address', 'address', 'principal place of business'];
            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (addressLabels.some(label => dtText.includes(label))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') address = dd.textContent.trim();
                }
            });

            return {
                emails: emails.length ? emails : null,
                phones: phones.length ? phones : null,
                website: websites.length ? websites[0] : null,
                facebook: facebook.length ? facebook : null,
                linkedin: linkedin.length ? linkedin : null,
                instagram: instagram.length ? instagram : null,
                tiktok: tiktok.length ? tiktok : null,
                address: address
            };
        });

        return { status: 'success', uen, url: page.url(), ...data };

    } catch (err) {
        return { status: 'error', uen, error: err.message };
    }
}
"""

def run_scraper(client, uens):
    """Scrape RecordOwl for a batch of UENs with one ``apify/puppeteer-scraper`` run.

    Args:
        client: Apify client used to start the actor, poll the run and read
            its default dataset.
        uens: Iterable of UENs. Each one becomes a RecordOwl search start URL
            and is also passed to the page function through ``userData``.

    Returns:
        A ``(items, error)`` tuple. On success ``items`` is the list of
        dataset items and ``error`` is ``None``; on any failure ``items`` is
        an empty list and ``error`` is a human-readable message. An empty
        batch returns ``([], None)`` without starting a run.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import quote

    start_urls = [
        {
            # Percent-encode the UEN so it cannot break the query string.
            "url": f"https://recordowl.com/search?name={quote(str(uen), safe='')}",
            "userData": {"uen": uen},
        }
        for uen in uens
    ]

    # Nothing to scrape: do not start an actor run at all.
    if not start_urls:
        return [], None

    run_input = {
        "startUrls": start_urls,
        "useChrome": False,
        "headless": True,
        "stealth": False,  # CHANGED: Removed stealth overhead
        "pageFunction": create_pagefunction_v9_fixed(),
        "maxRequestRetries": MAX_RETRIES,
        "maxRequestsPerCrawl": len(start_urls),
        "maxConcurrency": MAX_CONCURRENCY,
        "memoryMbytes": 2048,  # NEW: Reduced from 4096 MB
        "pageLoadTimeoutSecs": 25,  # CHANGED: Reduced from 40
        "pageFunctionTimeoutSecs": 60,  # CHANGED: Reduced from 120
        "waitUntil": ["domcontentloaded"],
        "proxyConfiguration": {"useApifyProxy": True},  # CHANGED: Datacenter instead of residential
    }

    # Any failure while starting the run is reported as an error string so the
    # caller can record it against every UEN of the batch.
    try:
        run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
        if not run or not isinstance(run, dict) or 'id' not in run:
            return [], "API returned invalid response"
    except Exception as e:
        return [], f"API call failed: {str(e)}"

    try:
        run_client = client.run(run["id"])
        run_info = run_client.wait_for_finish()
        status = run_info.get('status', 'UNKNOWN')

        if status in ['FAILED', 'TIMED-OUT', 'ABORTED']:
            return [], f"Actor run {status}"

        if status == "SUCCEEDED" and "defaultDatasetId" in run:
            dataset = client.dataset(run["defaultDatasetId"])
            items = list(dataset.iterate_items())
            return items, None

        return [], f"Scraping failed with status: {status}"
    except Exception as e:
        return [], f"Run monitoring error: {str(e)}"


# Execute scraper

# Column order of the result frame. Passing it to the DataFrame constructor
# keeps these columns present even when no row was collected.
_RESULT_COLUMNS = [
    'UEN', 'Status', 'Error', 'Emails', 'Phones', 'Website', 'Facebook',
    'LinkedIn', 'Instagram', 'TikTok', 'address', 'RecordOwl_Link',
]
# Fields left empty when a UEN produced no usable item.
_EMPTY_CONTACT_FIELDS = dict.fromkeys(_RESULT_COLUMNS[3:])

all_results = []
total_rows = len(acra_data_filtered_by_industry)
total_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

print(f"Starting scraper - {total_rows} UENs in {total_batches} batches")

for batch_num, batch_idx in enumerate(range(0, total_rows, BATCH_SIZE), start=1):
    batch = acra_data_filtered_by_industry.iloc[batch_idx:batch_idx + BATCH_SIZE]
    # Read the column directly: iterrows() builds a Series per row and does
    # not preserve column dtypes.
    uens = [str(uen).strip() for uen in batch['UEN']]

    print(f"\nBatch {batch_num}/{total_batches} - {len(uens)} UENs...")

    items, error = run_scraper(client, uens)

    if error:
        # The whole run failed: record the same error against every UEN.
        print(f"  Error: {error}")
        all_results.extend(
            {"UEN": uen, "Status": "error", "Error": error, **_EMPTY_CONTACT_FIELDS}
            for uen in uens
        )
        continue

    # Index the returned items by the UEN echoed back by the page function.
    uen_map = {item.get('uen'): item for item in items if item.get('uen')}

    success_count = 0
    for uen in uens:
        item = uen_map.get(uen)
        if not item:
            all_results.append({
                "UEN": uen, "Status": "missing", "Error": "No data returned",
                **_EMPTY_CONTACT_FIELDS,
            })
            continue

        status = item.get('status', 'error')
        if status == 'success':
            success_count += 1

        all_results.append({
            'UEN': uen,
            'Status': status,
            'Error': item.get('error'),
            'Emails': item.get('emails'),
            'Phones': item.get('phones'),
            'Website': item.get('website'),
            'Facebook': item.get('facebook'),
            'LinkedIn': item.get('linkedin'),
            'Instagram': item.get('instagram'),
            'TikTok': item.get('tiktok'),
            'address': item.get('address'),
            'RecordOwl_Link': item.get('url'),
        })

    print(f"  Success: {success_count}/{len(uens)}")

    # Short pause between consecutive actor runs (skipped after the last batch).
    if batch_num < total_batches:
        time.sleep(3)

New_Fresh_Leads = pd.DataFrame(all_results, columns=_RESULT_COLUMNS)

Starting scraper - 50 UENs in 1 batches

Batch 1/1 - 50 UENs...


[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> 2025-12-04T14:53:31.600Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> 2025-12-04T14:53:31.602Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> 2025-12-04T14:53:31.637Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> 2025-12-04T14:53:31.848Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:ALKN0wQb4tGRZf27T][0m -> 2025-12-04T14:53:33.386Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:ALK

  Success: 50/50


In [6]:
# Print a summary of the scraping run: one count per status the pipeline can
# produce, then the number of rows with a non-null value in each contact field.
print(f"\n{'='*70}")
print("✅ SCRAPING COMPLETE")
print(f"{'='*70}")
print("📊 Results Summary:")
print(f"   • Total processed: {len(New_Fresh_Leads)}")
print(f"   • Successful: {(New_Fresh_Leads['Status']=='success').sum()}")
print(f"   • Failed: {(New_Fresh_Leads['Status']=='error').sum()}")
print(f"   • Missing: {(New_Fresh_Leads['Status']=='missing').sum()}")
# The page function can also return these two statuses; without them the
# counts above would not add up to the total.
print(f"   • Not found: {(New_Fresh_Leads['Status']=='not_found').sum()}")
print(f"   • UEN mismatch: {(New_Fresh_Leads['Status']=='uen_mismatch').sum()}")
print("\n📞 Data Extracted:")
print(f"   • Phones: {New_Fresh_Leads['Phones'].notna().sum()}")
print(f"   • Emails: {New_Fresh_Leads['Emails'].notna().sum()}")
print(f"   • Websites: {New_Fresh_Leads['Website'].notna().sum()}")
print(f"   • Facebook: {New_Fresh_Leads['Facebook'].notna().sum()}")
print(f"   • Instagram: {New_Fresh_Leads['Instagram'].notna().sum()}")
print(f"   • LinkedIn: {New_Fresh_Leads['LinkedIn'].notna().sum()}")
print(f"   • TikTok: {New_Fresh_Leads['TikTok'].notna().sum()}")
print(f"{'='*70}")

New_Fresh_Leads.head(10)


‚úÖ SCRAPING COMPLETE
üìä Results Summary:
   ‚Ä¢ Total processed: 50
   ‚Ä¢ Successful: 50
   ‚Ä¢ Failed: 0
   ‚Ä¢ Missing: 0

üìû Data Extracted:
   ‚Ä¢ Phones: 23
   ‚Ä¢ Emails: 19
   ‚Ä¢ Websites: 18
   ‚Ä¢ Facebook: 21
   ‚Ä¢ Instagram: 20
   ‚Ä¢ LinkedIn: 8
   ‚Ä¢ TikTok: 0


Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,address,RecordOwl_Link
0,T21LL0056H,success,,,,https://www.creo.sg/,,,[https://www.instagram.com/creostudio.sg/],,101 LORONG 23 GEYLANG #08-01A PROSPER HOUSE 38...,https://recordowl.com/company/creo-design-llp
1,202344755W,success,,[hello@boomglobalnetwork.com],,https://boomglobalnetwork.com/,,,,,453B FERNVALE ROAD #08-513 FERNVALE FLORA 792453,https://recordowl.com/company/boom-global-priv...
2,202232521M,success,,[lstudio17@gmail.com],[+6590688349],https://www.lstudiodesign.net/,,,,,18 SIMEI RISE #09-45 CHANGI RISE CONDOMINIUM 5...,https://recordowl.com/company/lstudio-design-p...
3,201938153M,success,,[admin@triple3interior.com],[+6588589225],,[https://www.facebook.com/Triple3interior/],,[https://www.instagram.com/triple3interior/?hl...,,129 DESKER ROAD #02-01 SINGAPORE 209644,https://recordowl.com/company/triple-3-interio...
4,202119084D,success,,[contactus@architerior.sg],[+6565233854],https://thearchiinterior.com/,"[https://www.facebook.com/architeriorsg/, http...",,[https://www.instagram.com/architerior.sg/],,110 LORONG 23 GEYLANG #06-09 VICTORY CENTRE 38...,https://recordowl.com/company/architerior-pte-ltd
5,202210283W,success,,,,,[https://www.facebook.com/p/JSLim-Design-10009...,,,,281 BUKIT BATOK EAST AVENUE 3 #01-295 650281,https://recordowl.com/company/jslim-design-pte...
6,202313773K,success,,,,,,,,,150 BISHAN STREET 11 #01-153 BISHAN GREEN 570150,https://recordowl.com/company/hoh-interior-des...
7,202236194C,success,,,,https://id-ea.com/,,,,,6 UBI ROAD 1 #01-12 WINTECH CENTRE 408726,https://recordowl.com/company/id-ea-studio-pri...
8,53465827W,success,,,,,,,,,443A BUKIT BATOK WEST AVENUE 8 #10-821 WEST RI...,https://recordowl.com/company/san-thong-interi...
9,53381438C,success,,,,https://donstudio.com/,,[https://www.linkedin.com/company/donstudio],[https://www.instagram.com/studio.don/],,85 FLORA DRIVE #02-48 HEDGES PARK CONDOMINIUM ...,https://recordowl.com/company/don-studio


### DIAGNOSTIC Test

In [None]:
# # DIAGNOSTIC CODE - Run this to identify the issue
# try:
#     print("Testing Apify API connection...")
    
#     # Test 1: Check if API key is valid
#     user_info = client.user().get()
#     print(f"✅ API Key valid - User: {user_info.get('username', 'Unknown')}")
    
#     # Test 2: Check account limits and usage
#     limits = user_info.get('limits', {})
#     print(f"\n📊 Account Status:")
#     print(f"   • Plan: {user_info.get('plan', 'Unknown')}")
#     print(f"   • Credit balance: ${user_info.get('credit', 'N/A')}")
    
#     # Test 3: Check if the actor exists
#     try:
#         actor_info = client.actor("apify/puppeteer-scraper").get()
#         print(f"\n✅ Actor found: {actor_info.get('name', 'Unknown')}")
#     except Exception as e:
#         print(f"\n❌ Actor not found: {e}")
    
#     print("\n💡 If you see JSONDecodeError above, most likely causes:")
#     print("   1. Out of Apify credits")
#     print("   2. Rate limit exceeded")
#     print("   3. Invalid API key")
    
# except Exception as e:
#     print(f"❌ API Connection Failed: {type(e).__name__}: {e}")
#     print("\n💡 Check:")
#     print("   1. Is your API key valid?")
#     print("   2. Do you have sufficient credits?")
#     print("   3. Is your network connection stable?")

### Address Formatting 

In [None]:
# Pre-compile patterns for speed

# Six-digit postal code, optionally preceded by the word "Singapore". The
# look-behind and look-ahead stop it from matching six digits taken out of a
# longer digit run (for example an 8-digit phone number).
POSTAL_RE = re.compile(r"(?:\bSingapore\b\s*)?(?<!\d)(?P<postal>\d{6})(?!\d)", re.IGNORECASE)

# Unit-number shapes, tried in this order: "#08-513", "unit 08-513", "unit 5".
# The separator may be a hyphen or an en dash; the en dash is written as the
# escape \u2013 so the pattern does not depend on the file's encoding.
UNIT_RES = [
    re.compile(r"#\s*[A-Za-z0-9]{1,4}\s*[-\u2013]\s*[A-Za-z0-9]{1,4}", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,4}\s*[-\u2013]\s*[A-Za-z0-9]{1,4}\b", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,5}\b", re.IGNORECASE),
]

def normalize_spaces(text: str) -> str:
    """Flatten line breaks and tabs, squeeze repeated whitespace, trim the ends.

    Leading and trailing spaces, commas, semicolons, pipes and slashes are
    removed from the result.
    """
    # Applied in order: newlines/tabs become one space, then any run of two or
    # more whitespace characters becomes one space.
    for pattern in (r"[\n\r\t]+", r"\s{2,}"):
        text = re.sub(pattern, " ", text)
    return text.strip(" ,;|/")

def extract_postal(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    matches = list(POSTAL_RE.finditer(text))
    if matches:
        m = matches[-1]
        postal = m.group("postal")
        start, end = m.span()
        cleaned = text[:start] + text[end:]
        cleaned = re.sub(r"\bSingapore\b", "", cleaned, flags=re.IGNORECASE)
        return normalize_spaces(cleaned), postal
    return normalize_spaces(text), None

def extract_unit(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    for rx in UNIT_RES:
        m = rx.search(text)
        if m:
            unit_raw = m.group(0)
            cleaned = normalize_spaces(text[:m.start()] + text[m.end():])
            unit_digits = re.sub(r"^unit\s*[#:]?\s*", "", unit_raw, flags=re.IGNORECASE)
            unit_digits = normalize_spaces(unit_digits)
            unit_digits = unit_digits.replace(' ‚Äì ', '-').replace('‚Äì', '-').replace(' ', '')
            unit_digits = unit_digits.lstrip('#')
            return cleaned, unit_digits
    return normalize_spaces(text), None

def clean_street(text: str) -> str | None:
    if not text:
        return None
    text = normalize_spaces(text)
    text = re.sub(r"\s*,\s*", ", ", text)
    return text if text.isupper() else text.title()

def split_address_sg(address: str) -> dict:
    """Split a Singapore address string into street, unit and postal code.

    Returns:
        A dict with the keys ``street``, ``unit``, ``postal_code`` and
        ``address_clean``. ``address_clean`` joins the street, the unit and
        ``"Singapore <postal>"``, skipping parts that are missing. All four
        values are ``None`` when *address* is not a string or is blank.
    """
    is_blank = not isinstance(address, str) or not address.strip()
    if is_blank:
        return dict.fromkeys(("street", "unit", "postal_code", "address_clean"))

    # Peel the postal code off first, then the unit; what is left is the street.
    remainder, postal = extract_postal(normalize_spaces(address))
    remainder, unit = extract_unit(remainder)
    remainder = re.sub(r"\bSingapore\b", "", remainder, flags=re.IGNORECASE)
    street = clean_street(normalize_spaces(remainder))

    parts = [street or "", unit or ""]
    parts.append(f"Singapore {postal}" if postal else "")
    address_clean = normalize_spaces(" ".join(part for part in parts if part))

    return {
        "street": street,
        "unit": unit,
        "postal_code": postal,
        "address_clean": address_clean,
    }

# Parse every scraped address and attach the components as new columns.
if 'address' not in New_Fresh_Leads.columns:
    raise ValueError("Column 'address' not found in New_Fresh_Leads. Run the scraping cell first.")

# One dict per row, expanded into the columns street / unit / postal_code / address_clean.
parsed_df = pd.DataFrame(New_Fresh_Leads["address"].apply(split_address_sg).tolist())

# Replace the raw 'address' column with the parsed components.
Cleaned_New_Fresh_Leads = New_Fresh_Leads.drop(columns=['address']).assign(
    operational_street=parsed_df["street"],
    operational_unit=parsed_df["unit"],
    operational_postal_code=parsed_df["postal_code"],
    operational_address=parsed_df["address_clean"],
)

# Independent copy used by the following cells; shown as the cell output.
New_Fresh_Leads_Operational = Cleaned_New_Fresh_Leads.copy()
New_Fresh_Leads_Operational

### Check for duplication of UEN and Phone Number

In [None]:
# Boolean masks for duplicates
uen_dup = New_Fresh_Leads_Operational["UEN"].duplicated(keep=False)


def _phones_key(value):
    """Return a hashable, order-independent key for one "Phones" cell."""
    if value is None or isinstance(value, (str, float)):
        return value
    try:
        return tuple(sorted(str(phone) for phone in value))
    except TypeError:
        # Not iterable: compare the value as it is.
        return value


# "Phones" cells hold lists of numbers. Lists are unhashable, so calling
# duplicated() on them directly raises TypeError; compare tuple keys instead.
phone_keys = New_Fresh_Leads_Operational["Phones"].map(_phones_key)
phone_dup = phone_keys.notna() & phone_keys.duplicated(keep=False)

# YES/NO summary
print(
    "UEN dup:", "YES" if uen_dup.any() else "NO",
    "| Phone dup:", "YES" if phone_dup.any() else "NO"
)

# Show duplicate rows if exist
if uen_dup.any():
    print("\n🔁 Duplicate UEN rows:")
    display(New_Fresh_Leads_Operational[uen_dup])

if phone_dup.any():
    print("\n📱 Duplicate Phone rows:")
    display(New_Fresh_Leads_Operational[phone_dup])


### Drop duplicate phone numbers

In [None]:
# # Convert list-like Phones into strings for comparison
# New_Fresh_Leads_Operational["Phones_str"] = (
#     New_Fresh_Leads_Operational["Phones"].astype(str)
# )

# # Create a NEW DataFrame with duplicate phone numbers removed
# New_Fresh_Leads_Operational_unique_phones = (
#     New_Fresh_Leads_Operational.drop_duplicates(
#         subset="Phones_str", keep="first"
#     )
#     .drop(columns=["Phones_str"])  # clean up helper column
# )

# # Show size change
# print("Original:", len(New_Fresh_Leads_Operational))
# print("Unique Phones:", len(New_Fresh_Leads_Operational_unique_phones))


In [None]:
# Take an independent (deep) copy of the frame for export.
New_Fresh_Leads_Operational_x = New_Fresh_Leads_Operational.copy(deep=True)

In [None]:
# Write the silver-layer output. The target folder is created first so the
# write does not fail when the directory is missing.
silver_output_path = "./Staging/Silver/Silver_data_2_500_2.parquet"
os.makedirs(os.path.dirname(silver_output_path), exist_ok=True)
New_Fresh_Leads_Operational_x.to_parquet(silver_output_path, index=False, engine="fastparquet")

In [None]:
# New_Fresh_Leads_Operational.to_csv("New_Fresh_Leads_Operational.csv")