### Data Mining in RecordOwl (Silver 1)

In [1]:
# Standard library
import os
import glob
import re
import time
import asyncio

# Third-party HTTP / async
import requests
import aiohttp
import nest_asyncio

# Data & analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Web scraping
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup

# Fuzzy matching
from fuzzywuzzy import fuzz, process

# Apify
from apify_client import ApifyClient


### Ingesting from previous layer

In [2]:
parquet_path = "./Staging/Bronze/bronze_data_1.parquet"
if os.path.exists(parquet_path):
    acra_data_filtered_by_industry = pd.read_parquet(parquet_path, engine="fastparquet")
    print(f"Loaded {len(acra_data_filtered_by_industry)} rows from {parquet_path}")
    print(acra_data_filtered_by_industry.shape)
else:
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

Loaded 4206 rows from ./Staging/Bronze/bronze_data_1.parquet
(4206, 14)


In [3]:
acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,200613314G,ASCEND INTERNATIONAL TRAINING PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,11/09/2006,85509,88912,GOLDHILL PLAZA,308899,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
1,200708627D,ART BUILDERZ PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,16/05/2007,85509,na,SIN MING LANE,573969,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
2,200715323E,ACP COMPUTER TRAINING SCHOOL PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,20/08/2007,85509,85409,ANG MO KIO STREET 62,569139,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
3,200721644W,ATHLETE DEVELOPMENT PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,21/11/2007,85509,85409,EASTWOOD ROAD,486442,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4,200806301Z,ADDCEL EDU CONSULTANCY (PTE.) LTD.,,LOCAL COMPANY,LIVE COMPANY,01/04/2008,85509,na,HOUGANG STREET 51,530566,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4201,53494545K,ZAN LEARNING STUDIO,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12/11/2024,85509,na,WOODLANDS RING ROAD,730608,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4202,53497327B,ZYNTELLECT ADVISORY,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07/01/2025,85509,na,CHESTNUT AVENUE,679524,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4203,53501816M,ZAVIER TUITION SERVICES,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28/03/2025,85509,na,UPPER THOMSON ROAD,574364,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4204,53515671E,ZENITH CONCEPT MASTERY,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,23/12/2025,85509,na,BUKIT BATOK WEST AVENUE 6,651452,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.


In [4]:
acra_data_filtered_by_industry = acra_data_filtered_by_industry.head(10)

In [5]:
acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,200613314G,ASCEND INTERNATIONAL TRAINING PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,11/09/2006,85509,88912,GOLDHILL PLAZA,308899,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
1,200708627D,ART BUILDERZ PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,16/05/2007,85509,na,SIN MING LANE,573969,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
2,200715323E,ACP COMPUTER TRAINING SCHOOL PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,20/08/2007,85509,85409,ANG MO KIO STREET 62,569139,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
3,200721644W,ATHLETE DEVELOPMENT PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,21/11/2007,85509,85409,EASTWOOD ROAD,486442,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4,200806301Z,ADDCEL EDU CONSULTANCY (PTE.) LTD.,,LOCAL COMPANY,LIVE COMPANY,01/04/2008,85509,na,HOUGANG STREET 51,530566,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
5,200902560M,ALPHA EDUCATION CONSULTANTS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,12/02/2009,85509,85501,CHOA CHU KANG STREET 54,680768,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
6,200904900G,AUTISM RECOVERY NETWORK (SINGAPORE) PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,19/03/2009,85509,na,JOO CHIAT ROAD,427671,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
7,200910650Z,ABC CENTER PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,12/06/2009,85509,na,SOMERSET ROAD,238164,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
8,200912105K,ACCENDO LEARNERS' HUB PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,04/07/2009,85509,88912,BUKIT BATOK WEST AVENUE 5,651395,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
9,200923394W,AG EDUCATION PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,15/12/2009,85509,na,JALAN AWANG,419644,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.


### Mining RecordOwl

In [6]:
# COST-OPTIMIZED SCRAPER - V10 FIXED (No waitForTimeout)
from apify_client import ApifyClient
import pandas as pd
import time
import json
from requests.exceptions import RequestException
from dotenv import load_dotenv
import os

load_dotenv()

client = ApifyClient(os.getenv("APIFY_API_KEY"))

BATCH_SIZE = 50
MAX_CONCURRENCY = 5
MAX_RETRIES = 3

def create_pagefunction_v9_fixed() -> str:
    """V9: Fixed - removed waitForTimeout, using proper Puppeteer methods"""
    return """
async function pageFunction(context) {
    const { page, log, request } = context;
    const uen = request?.userData?.uen || '';

    if (!uen) return { status: 'error', uen: null, error: 'Missing UEN' };

    try {
        // Wait for search results with fallback
        try {
            await page.waitForSelector('.flex-1.min-w-0', { timeout: 20000 });
        } catch (e) {
            await page.waitForSelector('a[href*="/company/"]', { timeout: 10000 });
        }

        // Find and click link with EXACT UEN match
        const linkClickResult = await page.evaluate((targetUen) => {
            const uenUpper = targetUen.toUpperCase();
            const allLinks = document.querySelectorAll('a[href*="/company/"]');

            if (allLinks.length === 0) {
                return { clicked: false, reason: 'No company links found' };
            }

            // First pass: EXACT UEN match
            for (const link of allLinks) {
                let parent = link.parentElement;
                for (let i = 0; i < 5 && parent; i++) {
                    const parentText = (parent.innerText || parent.textContent || '').toUpperCase();
                    const uenPattern = new RegExp('\\\\b' + uenUpper.replace(/[.*+?^${}()|[\\\\]\\\\]/g, '\\\\$&') + '\\\\b');
                    
                    if (uenPattern.test(parentText)) {
                        link.click();
                        return { clicked: true, href: link.getAttribute('href'), matchType: 'exact' };
                    }
                    parent = parent.parentElement;
                }
            }

            // Fallback: Click first result
            allLinks[0].click();
            return { clicked: true, href: allLinks[0].getAttribute('href'), matchType: 'fallback' };
        }, uen);

        if (!linkClickResult.clicked) {
            return { status: 'not_found', uen, error: linkClickResult.reason || 'No search results' };
        }

        // Wait for navigation
        try {
            await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 25000 });
        } catch (navError) {
            const currentUrl = page.url();
            if (!currentUrl.includes('/company/')) {
                return { status: 'error', uen, error: 'Navigation failed' };
            }
        }

        // Wait for content with multiple fallback selectors
        try {
            await page.waitForSelector('dt', { timeout: 15000 });
        } catch (e1) {
            try {
                await page.waitForSelector('dd', { timeout: 10000 });
            } catch (e2) {
                try {
                    await page.waitForSelector('a[href^="mailto:"]', { timeout: 5000 });
                } catch (e3) {
                    // Continue anyway - some pages might not have these elements
                }
            }
        }

        // Verify UEN on the company page
        const uenVerification = await page.evaluate((targetUen) => {
            const uenUpper = targetUen.toUpperCase();
            const pageText = (document.body.innerText || document.body.textContent || '').toUpperCase();
            const uenPattern = new RegExp('\\\\b' + uenUpper.replace(/[.*+?^${}()|[\\\\]\\\\]/g, '\\\\$&') + '\\\\b');
            
            let uenFound = uenPattern.test(pageText);
            let uenInStructure = false;
            
            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (dtText.includes('uen') || dtText.includes('registration') || dtText.includes('business registration')) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        const ddText = dd.textContent.toUpperCase().trim();
                        if (uenPattern.test(ddText)) {
                            uenInStructure = true;
                        }
                    }
                }
            });

            return {
                uenFoundInPage: uenFound || uenInStructure,
                pageUrl: window.location.href
            };
        }, uen);

        // If UEN not found, return mismatch
        if (!uenVerification.uenFoundInPage) {
            return {
                status: 'uen_mismatch',
                uen,
                url: uenVerification.pageUrl,
                error: 'UEN not found on company page'
            };
        }

        // Extract data
        const data = await page.evaluate(() => {
            const SOCIAL_MEDIA_DOMAINS = ['facebook.com','linkedin.com','instagram.com','tiktok.com','twitter.com','x.com','youtube.com','pinterest.com'];
            
            // EMAIL
            const emails = [];
            document.querySelectorAll('a[href^="mailto:"]').forEach(a => {
                const email = a.href.replace('mailto:', '').trim();
                if (email && email.includes('@') && !emails.includes(email)) {
                    emails.push(email);
                }
            });
            
            // PHONE
            const phones = [];
            
            function formatSingaporePhone(text) {
                const digitsOnly = text.replace(/\\D/g, '');
                if (digitsOnly.length === 8) return '+65' + digitsOnly;
                if (digitsOnly.length === 10 && digitsOnly.startsWith('65')) return '+' + digitsOnly;
                if (digitsOnly.length === 11 && digitsOnly.startsWith('65')) return '+65' + digitsOnly.slice(2);
                if (digitsOnly.length >= 10) {
                    for (let i = 0; i <= digitsOnly.length - 10; i++) {
                        if (digitsOnly.slice(i, i+2) === '65' && digitsOnly.length - i >= 10) {
                            return '+' + digitsOnly.slice(i, i+10);
                        }
                    }
                }
                return null;
            }
            
            // tel: links
            document.querySelectorAll('a[href^="tel:"]').forEach(a => {
                const formatted = formatSingaporePhone(a.href.replace('tel:', '').trim());
                if (formatted && !phones.includes(formatted)) phones.push(formatted);
            });
            
            // dt/dd structure
            const phoneKeywords = ['company contact', 'business contact', 'office phone', 'main phone', 'business phone', 'company phone', 'contact number', 'phone', 'tel', 'mobile', 'call', 'contact no'];
            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (phoneKeywords.some(kw => dtText.includes(kw))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') {
                        const formatted = formatSingaporePhone(dd.textContent.trim());
                        if (formatted && !phones.includes(formatted)) phones.push(formatted);
                    }
                }
            });
            
            // Pattern matching
            const bodyText = document.body.innerText || document.body.textContent;
            const phonePatterns = [
                /\\b(\\+65[\\s\\-]?)?([689]\\d{3}[\\s\\-]?\\d{4})\\b/g,
                /\\b65[\\s\\-]?([689]\\d{3})[\\s\\-]?(\\d{4})\\b/g,
                /\\b([689]\\d{3})[\\s\\-](\\d{4})\\b/g
            ];
            phonePatterns.forEach(pattern => {
                const matches = bodyText.matchAll(pattern);
                for (const match of matches) {
                    const formatted = formatSingaporePhone(match[0]);
                    if (formatted && !phones.includes(formatted)) phones.push(formatted);
                }
            });
            
            // WEBSITE
            const websites = [];
            document.querySelectorAll('a[href^="http"]').forEach(a => {
                const href = a.href.trim().toLowerCase();
                if (!SOCIAL_MEDIA_DOMAINS.some(d => href.includes(d)) && !href.includes('recordowl') && !href.includes('apify')) {
                    if (href.match(/\\.(com|sg|net|org|co)/)) websites.push(a.href);
                }
            });
            
            // SOCIAL MEDIA
            const facebook = [], linkedin = [], instagram = [], tiktok = [];
            document.querySelectorAll('a[href*="facebook.com"]').forEach(a => { if (!facebook.includes(a.href)) facebook.push(a.href); });
            document.querySelectorAll('a[href*="linkedin.com"]').forEach(a => { if (!linkedin.includes(a.href)) linkedin.push(a.href); });
            document.querySelectorAll('a[href*="instagram.com"]').forEach(a => { if (!instagram.includes(a.href)) instagram.push(a.href); });
            document.querySelectorAll('a[href*="tiktok.com"]').forEach(a => { if (!tiktok.includes(a.href)) tiktok.push(a.href); });
            
            // ADDRESS
            let address = null;
            const addressLabels = ['registered address', 'registered office address', 'address', 'principal place of business'];
            document.querySelectorAll('dt').forEach(dt => {
                const dtText = dt.textContent.toLowerCase().trim();
                if (addressLabels.some(label => dtText.includes(label))) {
                    const dd = dt.nextElementSibling;
                    if (dd && dd.tagName === 'DD') address = dd.textContent.trim();
                }
            });

            return {
                emails: emails.length ? emails : null,
                phones: phones.length ? phones : null,
                website: websites.length ? websites[0] : null,
                facebook: facebook.length ? facebook : null,
                linkedin: linkedin.length ? linkedin : null,
                instagram: instagram.length ? instagram : null,
                tiktok: tiktok.length ? tiktok : null,
                address: address
            };
        });

        return { status: 'success', uen, url: page.url(), ...data };

    } catch (err) {
        return { status: 'error', uen, error: err.message };
    }
}
"""

def run_scraper(client, uens):
    start_urls = [{"url": f"https://recordowl.com/search?name={uen}", "userData": {"uen": uen}} for uen in uens]

    run_input = {
        "startUrls": start_urls,
        "useChrome": False,
        "headless": True,
        "pageFunction": create_pagefunction_v9_fixed(),
        "maxRequestRetries": MAX_RETRIES,
        "maxPagesPerCrawl": len(start_urls),
        "maxConcurrency": MAX_CONCURRENCY,
        "pageLoadTimeoutSecs": 25,
        "pageFunctionTimeoutSecs": 60,
        "waitUntil": ["domcontentloaded"],
        "proxyConfiguration": {"useApifyProxy": True},
        "downloadMedia": False,
        "downloadCss": False,
    }

    try:
        run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
        if not run or not isinstance(run, dict) or 'id' not in run:
            return [], "API returned invalid response"
    except Exception as e:
        return [], f"API call failed: {str(e)}"

    try:
        run_client = client.run(run["id"])
        run_info = run_client.wait_for_finish()
        status = run_info.get('status', 'UNKNOWN')

        if status in ['FAILED', 'TIMED-OUT', 'ABORTED']:
            return [], f"Actor run {status}"

        if status == "SUCCEEDED" and "defaultDatasetId" in run:
            dataset = client.dataset(run["defaultDatasetId"])
            items = list(dataset.iterate_items())
            return items, None

        return [], f"Scraping failed with status: {status}"
    except Exception as e:
        return [], f"Run monitoring error: {str(e)}"


# Execute scraper
all_results = []
total_rows = len(acra_data_filtered_by_industry)
total_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Starting scraper - {total_rows} UENs in {total_batches} batches")

for batch_idx in range(0, total_rows, BATCH_SIZE):
    batch = acra_data_filtered_by_industry.iloc[batch_idx:batch_idx + BATCH_SIZE]
    uens = [str(row['UEN']).strip() for _, row in batch.iterrows()]
    
    batch_num = (batch_idx//BATCH_SIZE)+1
    print(f"\nBatch {batch_num}/{total_batches} - {len(uens)} UENs...")
    
    items, error = run_scraper(client, uens)
    
    if error:
        print(f"  Error: {error}")
        for uen in uens:
            all_results.append({
                "UEN": uen, "Status": "error", "Error": error,
                **{k: None for k in ['Emails','Phones','Website','Facebook','LinkedIn','Instagram','TikTok','address','RecordOwl_Link']}
            })
        continue
    
    uen_map = {item.get('uen'): item for item in items if item.get('uen')}
    
    success_count = 0
    for uen in uens:
        item = uen_map.get(uen)
        if not item:
            all_results.append({
                "UEN": uen, "Status": "missing", "Error": "No data returned",
                **{k: None for k in ['Emails','Phones','Website','Facebook','LinkedIn','Instagram','TikTok','address','RecordOwl_Link']}
            })
        else:
            status = item.get('status', 'error')
            if status == 'success':
                success_count += 1
            
            all_results.append({
                'UEN': uen,
                'Status': status,
                'Error': item.get('error'),
                'Emails': item.get('emails'),
                'Phones': item.get('phones'),
                'Website': item.get('website'),
                'Facebook': item.get('facebook'),
                'LinkedIn': item.get('linkedin'),
                'Instagram': item.get('instagram'),
                'TikTok': item.get('tiktok'),
                'address': item.get('address'),
                'RecordOwl_Link': item.get('url')
            })
    
    print(f"  Success: {success_count}/{len(uens)}")
    
    if batch_num < total_batches:
        time.sleep(3)

New_Fresh_Leads = pd.DataFrame(all_results)

Starting scraper - 10 UENs in 1 batches

Batch 1/1 - 10 UENs...


[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> 2026-02-18T15:00:22.516Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> 2026-02-18T15:00:22.519Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> 2026-02-18T15:00:22.941Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> 2026-02-18T15:00:23.198Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:Tvs0cpqb40exBorWJ][0m -> 2026-02-18T15:00:26.027Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:Tvs

  Success: 10/10


In [7]:
print(f"\n{'='*70}")
print(f"‚úÖ SCRAPING COMPLETE")
print(f"{'='*70}")
print(f"üìä Results Summary:")
print(f"   ‚Ä¢ Total processed: {len(New_Fresh_Leads)}")
print(f"   ‚Ä¢ Successful: {(New_Fresh_Leads['Status']=='success').sum()}")
print(f"   ‚Ä¢ Failed: {(New_Fresh_Leads['Status']=='error').sum()}")
print(f"   ‚Ä¢ Missing: {(New_Fresh_Leads['Status']=='missing').sum()}")
print(f"\nüìû Data Extracted:")
print(f"   ‚Ä¢ Phones: {New_Fresh_Leads['Phones'].notna().sum()}")
print(f"   ‚Ä¢ Emails: {New_Fresh_Leads['Emails'].notna().sum()}")
print(f"   ‚Ä¢ Websites: {New_Fresh_Leads['Website'].notna().sum()}")
print(f"   ‚Ä¢ Facebook: {New_Fresh_Leads['Facebook'].notna().sum()}")
print(f"   ‚Ä¢ Instagram: {New_Fresh_Leads['Instagram'].notna().sum()}")
print(f"   ‚Ä¢ LinkedIn: {New_Fresh_Leads['LinkedIn'].notna().sum()}")
print(f"   ‚Ä¢ TikTok: {New_Fresh_Leads['TikTok'].notna().sum()}")
print(f"{'='*70}")

New_Fresh_Leads.head(10)


‚úÖ SCRAPING COMPLETE
üìä Results Summary:
   ‚Ä¢ Total processed: 10
   ‚Ä¢ Successful: 10
   ‚Ä¢ Failed: 0
   ‚Ä¢ Missing: 0

üìû Data Extracted:
   ‚Ä¢ Phones: 7
   ‚Ä¢ Emails: 6
   ‚Ä¢ Websites: 8
   ‚Ä¢ Facebook: 4
   ‚Ä¢ Instagram: 4
   ‚Ä¢ LinkedIn: 3
   ‚Ä¢ TikTok: 0


Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,address,RecordOwl_Link
0,200613314G,success,,,,http://atspace.sg/,,,,,1 GOLDHILL PLAZA #02-31 GOLDHILL PLAZA SINGAPO...,https://recordowl.com/company/ascend-internati...
1,200708627D,success,,[info@artbuilderz.com],[+6589318520],https://artbuilderz.com/,[https://www.facebook.com/Artbuilderz.sg],,[https://www.instagram.com/artbuilderz/],,22 SIN MING LANE #6-76 MIDVIEW CITY Singapore ...,https://recordowl.com/company/art-i-gallery-pt...
2,200715323E,success,,[hr_dept@acpcomputer.edu.sg],"[+6562277996, +6562279672]",https://acpcomputer.com/,,[https://sg.linkedin.com/in/acp-computer-61272...,[https://www.instagram.com/acpcomputersg/],,3 ANG MO KIO STREET 62 #06-18 LINK@AMK SINGAPO...,https://recordowl.com/company/acp-computer-tra...
3,200721644W,success,,[team@controllables.sg],[+6592214705],https://athlete.sg/,,,,,20 EASTWOOD ROAD #03-13 EASTWOOD CENTRE SINGAP...,https://recordowl.com/company/athlete-developm...
4,200806301Z,success,,[enquiries@addceledu.tech],[+6565620138],https://addcel.com.sg/,,,,,566 HOUGANG STREET 51 #06-456 SINGAPORE 530566,https://recordowl.com/company/addcel-edu-consu...
5,200902560M,success,,,,,,,,,768 CHOA CHU KANG STREET 54 #10-33 SINGAPORE 6...,https://recordowl.com/company/alpha-education-...
6,200904900G,success,,[info@autismrecovery.sg],[+6563488005],https://autismrecovery.sg/,[https://www.facebook.com/autism.singapore],[https://www.linkedin.com/company/autism-recov...,[https://www.instagram.com/arn.sg/],,458A JOO CHIAT ROAD SINGAPORE 427671,https://recordowl.com/company/autism-recovery-...
7,200910650Z,success,,[admin@abccentersingapore.com],[+6594236248],https://www.mycareersfuture.gov.sg/job/educati...,[https://www.facebook.com/p/Applied-Behavior-C...,[https://sg.linkedin.com/company/applied-behav...,[https://www.instagram.com/abc_center_singapore/],,111 SOMERSET ROAD #04-03 111 SOMERSET SINGAPOR...,https://recordowl.com/company/abc-center-pte-ltd
8,200912105K,success,,,[+6565153969],https://www.ascendo.sg/,[https://www.facebook.com/p/Ascendo-Academy-Pt...,,,,395A BUKIT BATOK WEST AVENUE 5 #03-04 GOODVIEW...,https://recordowl.com/company/accendo-learners...
9,200923394W,success,,,,,,,,,1A JALAN AWANG CHUAN VILLAS SINGAPORE 419644,https://recordowl.com/company/ag-education-pte...


### Address Formatting 

In [8]:
# Pre-compile patterns for speed
POSTAL_RE = re.compile(r"(?:\bSingapore\b\s*)?(?P<postal>\d{6})(?!\d)", re.IGNORECASE)
UNIT_RES = [
    re.compile(r"#\s*[A-Za-z0-9]{1,4}\s*[-‚Äì]\s*[A-Za-z0-9]{1,4}", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,4}\s*[-‚Äì]\s*[A-Za-z0-9]{1,4}\b", re.IGNORECASE),
    re.compile(r"\bunit\s*[#:]?\s*[A-Za-z0-9]{1,5}\b", re.IGNORECASE),
]

def normalize_spaces(text: str) -> str:
    text = re.sub(r"[\n\r\t]+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip(" ,;|/")

def extract_postal(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    matches = list(POSTAL_RE.finditer(text))
    if matches:
        m = matches[-1]
        postal = m.group("postal")
        start, end = m.span()
        cleaned = text[:start] + text[end:]
        cleaned = re.sub(r"\bSingapore\b", "", cleaned, flags=re.IGNORECASE)
        return normalize_spaces(cleaned), postal
    return normalize_spaces(text), None

def extract_unit(text: str) -> tuple[str, str | None]:
    if not text:
        return text, None
    for rx in UNIT_RES:
        m = rx.search(text)
        if m:
            unit_raw = m.group(0)
            cleaned = normalize_spaces(text[:m.start()] + text[m.end():])
            unit_digits = re.sub(r"^unit\s*[#:]?\s*", "", unit_raw, flags=re.IGNORECASE)
            unit_digits = normalize_spaces(unit_digits)
            unit_digits = unit_digits.replace(' ‚Äì ', '-').replace('‚Äì', '-').replace(' ', '')
            unit_digits = unit_digits.lstrip('#')
            return cleaned, unit_digits
    return normalize_spaces(text), None

def clean_street(text: str) -> str | None:
    if not text:
        return None
    text = normalize_spaces(text)
    text = re.sub(r"\s*,\s*", ", ", text)
    return text if text.isupper() else text.title()

def split_address_sg(address: str) -> dict:
    if not isinstance(address, str) or not address.strip():
        return {"street": None, "unit": None, "postal_code": None, "address_clean": None}
    raw = normalize_spaces(address)
    without_postal, postal = extract_postal(raw)
    without_unit, unit = extract_unit(without_postal)
    without_unit = normalize_spaces(re.sub(r"\bSingapore\b", "", without_unit, flags=re.IGNORECASE))
    street = clean_street(without_unit)
    address_clean = normalize_spaces(" ".join(x for x in [street or "", unit or "", f"Singapore {postal}" if postal else ""] if x))
    return {"street": street, "unit": unit, "postal_code": postal, "address_clean": address_clean}

# Apply to current result DF -> create a new dataframe with clean components
if 'address' not in New_Fresh_Leads.columns:
    raise ValueError("Column 'address' not found in New_Fresh_Leads. Run the scraping cell first.")

parsed_df = pd.DataFrame(list(New_Fresh_Leads["address"].apply(split_address_sg)))

# New DataFrame with clean address fields and without raw 'address'
Cleaned_New_Fresh_Leads = New_Fresh_Leads.copy()
if 'address' in Cleaned_New_Fresh_Leads.columns:
    Cleaned_New_Fresh_Leads = Cleaned_New_Fresh_Leads.drop(columns=['address'])
Cleaned_New_Fresh_Leads["operational_street"] = parsed_df["street"]
Cleaned_New_Fresh_Leads["operational_unit"] = parsed_df["unit"]
Cleaned_New_Fresh_Leads["operational_postal_code"] = parsed_df["postal_code"]
Cleaned_New_Fresh_Leads["operational_address"] = parsed_df["address_clean"]

# Save full result to a new DataFrame and display all columns
New_Fresh_Leads_Operational = Cleaned_New_Fresh_Leads.copy()
New_Fresh_Leads_Operational

Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,RecordOwl_Link,operational_street,operational_unit,operational_postal_code,operational_address
0,200613314G,success,,,,http://atspace.sg/,,,,,https://recordowl.com/company/ascend-internati...,1 GOLDHILL PLAZA GOLDHILL PLAZA,02-31,308899,1 GOLDHILL PLAZA GOLDHILL PLAZA 02-31 Singapor...
1,200708627D,success,,[info@artbuilderz.com],[+6589318520],https://artbuilderz.com/,[https://www.facebook.com/Artbuilderz.sg],,[https://www.instagram.com/artbuilderz/],,https://recordowl.com/company/art-i-gallery-pt...,22 SIN MING LANE MIDVIEW CITY,6-76,573969,22 SIN MING LANE MIDVIEW CITY 6-76 Singapore 5...
2,200715323E,success,,[hr_dept@acpcomputer.edu.sg],"[+6562277996, +6562279672]",https://acpcomputer.com/,,[https://sg.linkedin.com/in/acp-computer-61272...,[https://www.instagram.com/acpcomputersg/],,https://recordowl.com/company/acp-computer-tra...,3 ANG MO KIO STREET 62 LINK@AMK,06-18,569139,3 ANG MO KIO STREET 62 LINK@AMK 06-18 Singapor...
3,200721644W,success,,[team@controllables.sg],[+6592214705],https://athlete.sg/,,,,,https://recordowl.com/company/athlete-developm...,20 EASTWOOD ROAD EASTWOOD CENTRE,03-13,486442,20 EASTWOOD ROAD EASTWOOD CENTRE 03-13 Singapo...
4,200806301Z,success,,[enquiries@addceledu.tech],[+6565620138],https://addcel.com.sg/,,,,,https://recordowl.com/company/addcel-edu-consu...,566 HOUGANG STREET 51,06-456,530566,566 HOUGANG STREET 51 06-456 Singapore 530566
5,200902560M,success,,,,,,,,,https://recordowl.com/company/alpha-education-...,768 CHOA CHU KANG STREET 54,10-33,680768,768 CHOA CHU KANG STREET 54 10-33 Singapore 68...
6,200904900G,success,,[info@autismrecovery.sg],[+6563488005],https://autismrecovery.sg/,[https://www.facebook.com/autism.singapore],[https://www.linkedin.com/company/autism-recov...,[https://www.instagram.com/arn.sg/],,https://recordowl.com/company/autism-recovery-...,458A JOO CHIAT ROAD,,427671,458A JOO CHIAT ROAD Singapore 427671
7,200910650Z,success,,[admin@abccentersingapore.com],[+6594236248],https://www.mycareersfuture.gov.sg/job/educati...,[https://www.facebook.com/p/Applied-Behavior-C...,[https://sg.linkedin.com/company/applied-behav...,[https://www.instagram.com/abc_center_singapore/],,https://recordowl.com/company/abc-center-pte-ltd,111 SOMERSET ROAD 111 SOMERSET,04-03,238164,111 SOMERSET ROAD 111 SOMERSET 04-03 Singapore...
8,200912105K,success,,,[+6565153969],https://www.ascendo.sg/,[https://www.facebook.com/p/Ascendo-Academy-Pt...,,,,https://recordowl.com/company/accendo-learners...,395A BUKIT BATOK WEST AVENUE 5 GOODVIEW GARDENS,03-04,651395,395A BUKIT BATOK WEST AVENUE 5 GOODVIEW GARDEN...
9,200923394W,success,,,,,,,,,https://recordowl.com/company/ag-education-pte...,1A JALAN AWANG CHUAN VILLAS,,419644,1A JALAN AWANG CHUAN VILLAS Singapore 419644


### Check for duplication of UEN and Phone Number

In [9]:
# Boolean masks for duplicates
uen_dup = New_Fresh_Leads_Operational["UEN"].duplicated(keep=False)

phone_dup = (
    New_Fresh_Leads_Operational["Phones"].notna() &
    New_Fresh_Leads_Operational["Phones"].duplicated(keep=False)
)

# YES/NO summary
print(
    "UEN dup:", "YES" if uen_dup.any() else "NO",
    "| Phone dup:", "YES" if phone_dup.any() else "NO"
)

# Show duplicate rows if exist
if uen_dup.any():
    print("\nüîÅ Duplicate UEN rows:")
    display(New_Fresh_Leads_Operational[uen_dup])

if phone_dup.any():
    print("\nüì± Duplicate Phone rows:")
    display(New_Fresh_Leads_Operational[phone_dup])


UEN dup: NO | Phone dup: NO


### Drop duplicate phone numbers

In [10]:
# Convert list-like Phones into strings for comparison
New_Fresh_Leads_Operational["Phones_str"] = (
    New_Fresh_Leads_Operational["Phones"].astype(str)
)

# Create a NEW DataFrame with duplicate phone numbers removed
New_Fresh_Leads_Operational_unique_phones = (
    New_Fresh_Leads_Operational.drop_duplicates(
        subset="Phones_str", keep="first"
    )
    .drop(columns=["Phones_str"])  # clean up helper column
)

# Show size change
print("Original:", len(New_Fresh_Leads_Operational))
print("Unique Phones:", len(New_Fresh_Leads_Operational_unique_phones))


Original: 10
Unique Phones: 8


In [11]:
New_Fresh_Leads_Operational["PIC Source 1"] = None
New_Fresh_Leads_Operational["PIC Source 2"] = None
New_Fresh_Leads_Operational["PIC Source 3"] = None
New_Fresh_Leads_Operational = New_Fresh_Leads_Operational[["UEN","Phones", "Emails", "Website", "Facebook", "LinkedIn", "Instagram",  "TikTok", "operational_street", "operational_unit", "operational_postal_code", "operational_address", "PIC Source 1", "PIC Source 2", "PIC Source 3"]]

In [12]:
# Assign PIC Source 1 as "RecordOwl" if Phones has any value
New_Fresh_Leads_Operational.loc[(New_Fresh_Leads_Operational["Phones"].notna()) & (New_Fresh_Leads_Operational["Phones"] != ""), "PIC Source 1"] = "RecordOwl"
New_Fresh_Leads_Operational

Unnamed: 0,UEN,Phones,Emails,Website,Facebook,LinkedIn,Instagram,TikTok,operational_street,operational_unit,operational_postal_code,operational_address,PIC Source 1,PIC Source 2,PIC Source 3
0,200613314G,,,http://atspace.sg/,,,,,1 GOLDHILL PLAZA GOLDHILL PLAZA,02-31,308899,1 GOLDHILL PLAZA GOLDHILL PLAZA 02-31 Singapor...,,,
1,200708627D,[+6589318520],[info@artbuilderz.com],https://artbuilderz.com/,[https://www.facebook.com/Artbuilderz.sg],,[https://www.instagram.com/artbuilderz/],,22 SIN MING LANE MIDVIEW CITY,6-76,573969,22 SIN MING LANE MIDVIEW CITY 6-76 Singapore 5...,RecordOwl,,
2,200715323E,"[+6562277996, +6562279672]",[hr_dept@acpcomputer.edu.sg],https://acpcomputer.com/,,[https://sg.linkedin.com/in/acp-computer-61272...,[https://www.instagram.com/acpcomputersg/],,3 ANG MO KIO STREET 62 LINK@AMK,06-18,569139,3 ANG MO KIO STREET 62 LINK@AMK 06-18 Singapor...,RecordOwl,,
3,200721644W,[+6592214705],[team@controllables.sg],https://athlete.sg/,,,,,20 EASTWOOD ROAD EASTWOOD CENTRE,03-13,486442,20 EASTWOOD ROAD EASTWOOD CENTRE 03-13 Singapo...,RecordOwl,,
4,200806301Z,[+6565620138],[enquiries@addceledu.tech],https://addcel.com.sg/,,,,,566 HOUGANG STREET 51,06-456,530566,566 HOUGANG STREET 51 06-456 Singapore 530566,RecordOwl,,
5,200902560M,,,,,,,,768 CHOA CHU KANG STREET 54,10-33,680768,768 CHOA CHU KANG STREET 54 10-33 Singapore 68...,,,
6,200904900G,[+6563488005],[info@autismrecovery.sg],https://autismrecovery.sg/,[https://www.facebook.com/autism.singapore],[https://www.linkedin.com/company/autism-recov...,[https://www.instagram.com/arn.sg/],,458A JOO CHIAT ROAD,,427671,458A JOO CHIAT ROAD Singapore 427671,RecordOwl,,
7,200910650Z,[+6594236248],[admin@abccentersingapore.com],https://www.mycareersfuture.gov.sg/job/educati...,[https://www.facebook.com/p/Applied-Behavior-C...,[https://sg.linkedin.com/company/applied-behav...,[https://www.instagram.com/abc_center_singapore/],,111 SOMERSET ROAD 111 SOMERSET,04-03,238164,111 SOMERSET ROAD 111 SOMERSET 04-03 Singapore...,RecordOwl,,
8,200912105K,[+6565153969],,https://www.ascendo.sg/,[https://www.facebook.com/p/Ascendo-Academy-Pt...,,,,395A BUKIT BATOK WEST AVENUE 5 GOODVIEW GARDENS,03-04,651395,395A BUKIT BATOK WEST AVENUE 5 GOODVIEW GARDEN...,RecordOwl,,
9,200923394W,,,,,,,,1A JALAN AWANG CHUAN VILLAS,,419644,1A JALAN AWANG CHUAN VILLAS Singapore 419644,,,


In [13]:
New_Fresh_Leads_Operational.to_parquet("./Staging/Silver/Silver_data_1.parquet", index=False, engine="fastparquet")