### Data Mining in Website (Silver 3)

In [60]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


In [61]:
parquet_path = "./Staging/Silver/df_with_websites.parquet"
if os.path.exists(parquet_path):
    RecordOwl_Leads = pd.read_parquet(parquet_path, engine="fastparquet")
    print(f"Loaded {len(RecordOwl_Leads)} rows from {parquet_path}")
    print(RecordOwl_Leads.shape)
else:
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")


RecordOwl_Leads.tail(10)

Loaded 75 rows from ./Staging/Silver/df_with_websites.parquet
(75, 16)


Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,RecordOwl_Link,operational_street,operational_unit,operational_postal_code,operational_address,Phones_str
65,53434305L,success,,[ecodieseldynamics@gmail.com],,https://ecodieseldynamics.com/diesel-parts/,https://www.facebook.com/ecodieseldynamics,,,,https://recordowl.com/company/eco-diesel-auto-...,48 WHAMPOA EAST EIGHT RIVERSUITES,,338540,48 WHAMPOA EAST EIGHT RIVERSUITES Singapore 33...,
66,202012242C,success,,[tommy@jssmautosolutions.com],,https://www.mycareersfuture.gov.sg/job/manufac...,https://www.facebook.com/jssmautosolutions/,,,,https://recordowl.com/company/jssm-autosolutio...,8 KAKI BUKIT AVENUE 4 PREMIER @ KAKI BUKIT,02-13,415875,8 KAKI BUKIT AVENUE 4 PREMIER @ KAKI BUKIT 02-...,
67,202227809C,success,,,,https://g7workshop.sg/,https://www.facebook.com/g7workshopsg,,,,https://recordowl.com/company/g7-workshop-priv...,1 CORPORATION DRIVE /19,06-18,619775,1 CORPORATION DRIVE /19 06-18 Singapore 619775,
68,53482117X,success,,[jason@9kmotorsports.com],,https://9kmotors.com/,https://www.facebook.com/61568245975495,,"[https://www.instagram.com/9kracing, https://i...",,https://recordowl.com/company/9k-motorsports,401E FERNVALE LANE,08-328,795401,401E FERNVALE LANE 08-328 Singapore 795401,
69,53468383E,success,,,,https://genushair.com/en/,https://www.facebook.com/genushairitalia,,[https://www.instagram.com/genushair_official/],,https://recordowl.com/company/genus-color,39 WOODLANDS CLOSE MEGA@WOODLANDS,03-04,737856,39 WOODLANDS CLOSE MEGA@WOODLANDS 03-04 Singap...,
70,202343910E,success,,[sales@rtac-consulting-engineering.com],,https://rtac-consulting-engineering.com/,https://www.facebook.com/rtacconsulting/,,,,https://recordowl.com/company/rtac-pte-ltd,387 YISHUN RING ROAD SKY GREEN,02-1665,760387,387 YISHUN RING ROAD SKY GREEN 02-1665 Singapo...,
71,53468738E,success,,[Info@freshautocare.co.uk],,https://freshautocare.co.uk/,https://www.facebook.com/freshautocareuk/,,,,https://recordowl.com/company/fresh-auto-care,68 KAKI BUKIT AVENUE 6 ARK@KB,03-03,417896,68 KAKI BUKIT AVENUE 6 ARK@KB 03-03 Singapore ...,
72,202342246K,success,,,,https://www.trans-tec.com/,https://www.facebook.com/profile.php?id=615658...,,,,https://recordowl.com/company/trans-tech-speci...,18 SIN MING LANE MIDVIEW CITY,08-06,573960,18 SIN MING LANE MIDVIEW CITY 08-06 Singapore ...,
73,202514392K,success,,[info@masev-designs.com],,https://masev-designs.com/contactus.html,https://www.facebook.com/masev3d/,,,,https://recordowl.com/company/masev-pte-ltd,3791 JALAN BUKIT MERAH E-CENTRE @ REDHILL,08-12,159471,3791 JALAN BUKIT MERAH E-CENTRE @ REDHILL 08-1...,
74,202500469Z,success,,,,https://detailogy.co.uk/,https://www.facebook.com/detailogy/,,,,https://recordowl.com/company/detailogy-privat...,1 BUKIT BATOK CRESCENT WCEGA PLAZA,02-45,658064,1 BUKIT BATOK CRESCENT WCEGA PLAZA 02-45 Singa...,


In [62]:
# --- Split rows with Singapore-based hosting or known SG sources ---

# List of known SG platforms to check
sg_keywords = [
    "mycareersfuture",
    "recordowl",
    "bizfile",
]

# Ensure Website column is string to avoid errors
RecordOwl_Leads["Website"] = RecordOwl_Leads["Website"].astype(str)

# Build regex pattern
pattern = "|".join(sg_keywords)

# 1Ô∏è‚É£ Rows that match SG keywords (TRANSFER OUT)
sg_company_sites = RecordOwl_Leads[
    RecordOwl_Leads["Website"].str.contains(pattern, case=False, na=False)
]

# 2Ô∏è‚É£ Rows that DO NOT match SG keywords (REMAINING DATA)
RecordOwl_Leads = RecordOwl_Leads[
    ~RecordOwl_Leads["Website"].str.contains(pattern, case=False, na=False)
]

print("Transferred to sg_company_sites:", len(sg_company_sites))
print("Remaining rows in RecordOwl_Leads:", len(RecordOwl_Leads))


Transferred to sg_company_sites: 20
Remaining rows in RecordOwl_Leads: 55


In [63]:
RecordOwl_Leads.shape

(55, 16)

In [64]:
RecordOwl_Leads.shape

(55, 16)

In [65]:
# --- Initialize Apify client ---
APIFY_TOKEN = os.getenv("APIFY_TOKEN", "apify_api_gak2ulhepgd4uzBseSLQtiHnb9KGxy3iMwp2")
client = ApifyClient(APIFY_TOKEN)

# COST-OPTIMIZED BATCH CONFIGURATION
BATCH_SIZE = 50          # Process 50 websites per batch
MAX_CONCURRENCY = 3      # 3 concurrent browsers for stability
MAX_RETRIES = 2

def create_website_scraper_pagefunction():
    """Optimized pageFunction for extracting phone numbers - tries contact page first, falls back to home page"""
    return """
async function pageFunction(context) {
    const { page, log, request } = context;
    const website = request.url;
    const isContact = request.userData?.isContact || false;
    const isHomepage = request.userData?.isHomepage || false;

    log.info(`üîç Scraping: ${website}`);

    try {
        // STEP 1: If on main page, try to find contact page
        if (!isContact && !isHomepage) {
            // Wait for page to load
            await page.waitForSelector('a', { timeout: 10000 }).catch(() => null);

            // Find contact page link
            const contactUrl = await page.evaluate(() => {
                const links = Array.from(document.querySelectorAll('a[href]'));
                for (const link of links) {
                    const href = link.getAttribute('href');
                    const text = link.textContent.toLowerCase();
                    if ((href && href.toLowerCase().includes('contact')) ||
                        text.includes('contact') || text.includes('about')) {
                        const fullUrl = href.startsWith('http') ? href :
                                       href.startsWith('/') ? window.location.origin + href :
                                       window.location.origin + '/' + href;
                        return fullUrl;
                    }
                }
                return null;
            });

            if (contactUrl) {
                // Found contact page - navigate to it
                await context.enqueueRequest({
                    url: contactUrl,
                    userData: { isContact: true, originalUrl: website }
                });
                log.info(`‚úÖ Enqueued contact page: ${contactUrl}`);
                return null;
            } else {
                // No contact page found - scrape homepage instead
                log.info(`‚ö†Ô∏è No contact page found, scraping homepage: ${website}`);
                // Continue to extraction below (don't return)
            }
        }

        // STEP 2: Extract phone numbers (either from contact page or homepage)
        await page.waitForSelector('body', { timeout: 10000 });

        const contactData = await page.evaluate(() => {
            function formatSingaporePhone(text) {
                const digitsOnly = text.replace(/\\D/g, '');

                if (digitsOnly.length === 8) {
                    return '+65' + digitsOnly;
                }
                if (digitsOnly.length === 10 && digitsOnly.startsWith('65')) {
                    return '+' + digitsOnly;
                }
                if (digitsOnly.length === 11 && digitsOnly.startsWith('65')) {
                    return '+65' + digitsOnly.slice(2);
                }
                return null;
            }

            const phones = [];

            // Method 1: Extract from tel: links
            document.querySelectorAll('a[href^="tel:"]').forEach(a => {
                const formatted = formatSingaporePhone(a.href.replace('tel:', '').trim());
                if (formatted && !phones.includes(formatted)) {
                    phones.push(formatted);
                }
            });

            // Method 2: Pattern matching in body text
            const bodyText = document.body.innerText || document.body.textContent;
            const phonePatterns = [
                /\\b(\\+65[\\s\\-]?)?([689]\\d{3}[\\s\\-]?\\d{4})\\b/g,
                /\\b65[\\s\\-]?([689]\\d{3})[\\s\\-]?(\\d{4})\\b/g,
            ];

            phonePatterns.forEach(pattern => {
                const matches = bodyText.matchAll(pattern);
                for (const match of matches) {
                    const formatted = formatSingaporePhone(match[0]);
                    if (formatted && !phones.includes(formatted)) {
                        phones.push(formatted);
                    }
                }
            });

            return { phones: [...new Set(phones)] };
        });

        const pageType = isContact ? 'contact page' : 'homepage';
        log.info(`‚úÖ Found ${contactData.phones.length} phone(s) on ${pageType}: ${website}`);

        return {
            website: request.userData?.originalUrl || website,
            contactUrl: request.url,
            phones: contactData.phones.length ? contactData.phones : null,
            pageType: pageType,
            status: 'success'
        };

    } catch (err) {
        log.error(`‚ùå Error scraping ${website}: ${err.message}`);
        return {
            website: request.userData?.originalUrl || website,
            phones: null,
            pageType: 'unknown',
            status: 'error',
            error: err.message
        };
    }
}
"""

def run_website_scraper(client, websites):
    """Run Apify scraper for a batch of websites"""
    start_urls = [{"url": website, "userData": {"originalUrl": website}} for website in websites]

    print(f"  üìã Processing {len(start_urls)} websites in single actor run")

    run_input = {
        "startUrls": start_urls,
        "useChrome": False,              # Use Chromium (lighter)
        "headless": True,
        "stealth": True,
        "pageFunction": create_website_scraper_pagefunction(),
        "maxRequestRetries": MAX_RETRIES,
        "maxRequestsPerCrawl": len(start_urls) * 2,  # Account for main + contact pages
        "maxConcurrency": MAX_CONCURRENCY,
        "pageLoadTimeoutSecs": 30,
        "pageFunctionTimeoutSecs": 60,
        "waitUntil": ["domcontentloaded"],
        "proxyConfiguration": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"]
        },
        "proxyRotation": "RECOMMENDED",
    }

    print(f"  üöÄ Launching Apify actor with {MAX_CONCURRENCY} concurrent browsers...")

    try:
        run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

        if not run or not isinstance(run, dict) or 'id' not in run:
            return [], f"API returned invalid response: {run}"

        print(f"  ‚è≥ Run ID: {run['id']}")

        run_client = client.run(run["id"])
        run_info = run_client.wait_for_finish()

        status = run_info.get('status', 'UNKNOWN')
        print(f"  üìä Status: {status}")

        if status in ['FAILED', 'TIMED-OUT', 'ABORTED']:
            error_detail = run_info.get('statusMessage', 'No error details')
            return [], f"Actor run {status}: {error_detail}"

        if status == "SUCCEEDED" and "defaultDatasetId" in run:
            dataset = client.dataset(run["defaultDatasetId"])
            items = list(dataset.iterate_items())
            print(f"  ‚úÖ Retrieved {len(items)} results")
            return items, None

        return [], f"Scraping failed with status: {status}"

    except Exception as e:
        error_msg = f"Error during scraping: {type(e).__name__}: {str(e)}"
        print(f"  ‚ùå {error_msg}")
        return [], error_msg


# Execute scraper
print("="*70)
print("üåê WEBSITE PHONE NUMBER SCRAPER - BATCH OPTIMIZED")
print("="*70)
print(f"üìä Configuration:")
print(f"   ‚Ä¢ Batch size: {BATCH_SIZE} websites")
print(f"   ‚Ä¢ Concurrency: {MAX_CONCURRENCY} browsers")
print(f"   ‚Ä¢ Browser: Chromium")
print(f"   ‚Ä¢ Proxy: RESIDENTIAL")
print(f"   ‚Ä¢ Strategy: Try contact page first, fall back to homepage")
print("="*70)

# Validate API token
print(f"\nüîë Validating Apify API token...")
try:
    user_info = client.user().get()
    print(f"‚úÖ API Key valid - User: {user_info.get('username', 'Unknown')}")
    print(f"   ‚Ä¢ Plan: {user_info.get('plan', {}).get('id', 'Unknown')}")
    print(f"   ‚Ä¢ Credits remaining: Check your dashboard at https://console.apify.com/billing")
except Exception as e:
    print(f"‚ùå API Token Error: {e}")
    print(f"   ‚Ä¢ Check your token at: https://console.apify.com/account/integrations")
    print(f"   ‚Ä¢ Current token starts with: {APIFY_TOKEN[:15]}...")
    raise

# Use RecordOwl_Leads dataframe - filter for rows with valid Website column
print(f"\nüìã Total rows in RecordOwl_Leads: {len(RecordOwl_Leads)}")

# Filter for rows with non-null and non-empty websites
websites_to_scrape = RecordOwl_Leads[
    RecordOwl_Leads["Website"].notna() & 
    (RecordOwl_Leads["Website"] != "") &
    (RecordOwl_Leads["Website"] != "None")
].copy()

print(f"üìã Rows with valid websites: {len(websites_to_scrape)}")

all_results = []
total_rows = len(websites_to_scrape)
total_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE

for batch_idx in range(0, total_rows, BATCH_SIZE):
    batch = websites_to_scrape.iloc[batch_idx:batch_idx + BATCH_SIZE]
    websites = [str(row['Website']).strip() for _, row in batch.iterrows()]

    batch_num = (batch_idx//BATCH_SIZE)+1
    print(f"\n{'‚îÄ'*70}")
    print(f"üì¶ Batch {batch_num}/{total_batches} - Processing {len(websites)} websites")

    items, error = run_website_scraper(client, websites)

    if error:
        print(f"  ‚ùå Batch error: {error}")
        for website in websites:
            all_results.append({
                "Website": website,
                "Website_Scrape_Status": "error",
                "Website_Scrape_Error": error,
                "Website_Phones": None,
                "Website_Contact_Page": None,
                "Website_Page_Type": None
            })
        continue

    # Map results by website
    website_map = {}
    for item in items:
        if item and item.get('website'):
            # Store the result with most phones
            web = item['website']
            if web not in website_map or (item.get('phones') and len(item.get('phones', [])) > len(website_map[web].get('phones', []))):
                website_map[web] = item

    for website in websites:
        item = website_map.get(website)
        if not item:
            print(f"    ‚ö†Ô∏è  {website}: Not found in results")
            all_results.append({
                "Website": website,
                "Website_Scrape_Status": "missing",
                "Website_Scrape_Error": "No data returned",
                "Website_Phones": None,
                "Website_Contact_Page": None,
                "Website_Page_Type": None
            })
        else:
            status = item.get('status', 'error')
            phones = item.get('phones', None)
            page_type = item.get('pageType', 'unknown')
            phone_count = len(phones) if phones else 0

            if status == 'success' and phones:
                print(f"    ‚úÖ {website}: {phone_count} phone(s) from {page_type}")
            elif status == 'success':
                print(f"    ‚ö†Ô∏è  {website}: No phones found on {page_type}")
            else:
                print(f"    ‚ùå {website}: {status} - {item.get('error', 'Unknown')}")

            all_results.append({
                'Website': website,
                'Website_Scrape_Status': status,
                'Website_Scrape_Error': item.get('error'),
                'Website_Phones': phones,
                'Website_Contact_Page': item.get('contactUrl'),
                'Website_Page_Type': page_type
            })

    # Sleep between batches
    if batch_num < total_batches:
        time.sleep(2)

# Create results DataFrame
Website_Scraped_Results = pd.DataFrame(all_results)

# Merge with original RecordOwl_Leads to preserve all original columns
RecordOwl_Leads_Enriched = RecordOwl_Leads.merge(
    Website_Scraped_Results, 
    on='Website', 
    how='left'
)

üåê WEBSITE PHONE NUMBER SCRAPER - BATCH OPTIMIZED
üìä Configuration:
   ‚Ä¢ Batch size: 50 websites
   ‚Ä¢ Concurrency: 3 browsers
   ‚Ä¢ Browser: Chromium
   ‚Ä¢ Proxy: RESIDENTIAL
   ‚Ä¢ Strategy: Try contact page first, fall back to homepage

üîë Validating Apify API token...
‚úÖ API Key valid - User: SALESOPS_EPOS
   ‚Ä¢ Plan: STARTER
   ‚Ä¢ Credits remaining: Check your dashboard at https://console.apify.com/billing

üìã Total rows in RecordOwl_Leads: 55
üìã Rows with valid websites: 55

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üì¶ Batch 1/2 - Processing 50 websites
  üìã Processing 50 websites in single actor run
  üöÄ Launching Apify actor with 3 concurrent browsers...


[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> 2025-11-24T01:47:07.936Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> 2025-11-24T01:47:07.939Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> 2025-11-24T01:47:07.992Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> 2025-11-24T01:47:08.306Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:7M1ci4WqacngYIQDh][0m -> 2025-11-24T01:47:09.026Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:7M1

  ‚ùå Error during scraping: JSONDecodeError: Expecting value: line 1 column 1 (char 0)
  ‚ùå Batch error: Error during scraping: JSONDecodeError: Expecting value: line 1 column 1 (char 0)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üì¶ Batch 2/2 - Processing 5 websites
  üìã Processing 5 websites in single actor run
  üöÄ Launching Apify actor with 3 concurrent browsers...


[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> 2025-11-24T02:01:17.781Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> 2025-11-24T02:01:17.783Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> 2025-11-24T02:01:18.162Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> 2025-11-24T02:01:18.341Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:iKTZpZLpjG8vMWkcz][0m -> 2025-11-24T02:01:18.945Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:iKT

  ‚è≥ Run ID: iKTZpZLpjG8vMWkcz
  üìä Status: SUCCEEDED
  ‚úÖ Retrieved 8 results
    ‚ö†Ô∏è  https://rtac-consulting-engineering.com/: No phones found on contact page
    ‚ö†Ô∏è  https://freshautocare.co.uk/: No phones found on homepage
    ‚úÖ https://www.trans-tec.com/: 2 phone(s) from contact page
    ‚ö†Ô∏è  https://masev-designs.com/contactus.html: Not found in results
    ‚ö†Ô∏è  https://detailogy.co.uk/: No phones found on contact page


In [66]:
print(f"\n{'='*70}")
print(f"‚úÖ SCRAPING COMPLETE")
print(f"{'='*70}")
print(f"üìä Results Summary:")
print(f" Total processed: {len(Website_Scraped_Results)}")
print(f"  Successful: {(Website_Scraped_Results['Website_Scrape_Status']=='success').sum()}")
print(f"  Failed: {(Website_Scraped_Results['Website_Scrape_Status']=='error').sum()}")
print(f" Missing: {(Website_Scraped_Results['Website_Scrape_Status']=='missing').sum()}")
print(f"\nüìû Data Extracted:")
print(f" Website phones found: {Website_Scraped_Results['Website_Phones'].notna().sum()}")
print(f"\nüìÑ Page Types Scraped:")
print(f" Contact pages: {(Website_Scraped_Results['Website_Page_Type']=='contact page').sum()}")
print(f" Homepages: {(Website_Scraped_Results['Website_Page_Type']=='homepage').sum()}")
print(f"{'='*70}")

print(f"\nüìä Final Enriched DataFrame:")
print(f"   ‚Ä¢ Total rows: {len(RecordOwl_Leads_Enriched)}")
print(f"   ‚Ä¢ Columns: {list(RecordOwl_Leads_Enriched.columns)}")


‚úÖ SCRAPING COMPLETE
üìä Results Summary:
 Total processed: 55
  Successful: 4
  Failed: 50
 Missing: 1

üìû Data Extracted:
 Website phones found: 1

üìÑ Page Types Scraped:
 Contact pages: 3
 Homepages: 1

üìä Final Enriched DataFrame:
   ‚Ä¢ Total rows: 65
   ‚Ä¢ Columns: ['UEN', 'Status', 'Error', 'Emails', 'Phones', 'Website', 'Facebook', 'LinkedIn', 'Instagram', 'TikTok', 'RecordOwl_Link', 'operational_street', 'operational_unit', 'operational_postal_code', 'operational_address', 'Phones_str', 'Website_Scrape_Status', 'Website_Scrape_Error', 'Website_Phones', 'Website_Contact_Page', 'Website_Page_Type']


In [None]:
RecordOwl_Leads

In [None]:
RecordOwl_Leads.to_parquet("./Staging/Silver/carmotor_proceesd.parquet", index=False, engine="fastparquet")

In [None]:
sg_company_sites.shape