### Data Mining Facebook (Silver 2)

In [1]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


### Ingest Data

In [2]:
# Load the cleaned leads table; stop immediately when the parquet file is missing.
parquet_path = "./Staging/Gold/cleaned_second_592.parquet"
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

RecordOwl_Leads = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(RecordOwl_Leads)} rows from {parquet_path}")
print(RecordOwl_Leads.shape)

# Select leads that have a Facebook page but no primary contact number yet
# (both NaN and the empty string count as "missing").
facebook_page = RecordOwl_Leads["Facebook Page"]
contact_number = RecordOwl_Leads["PIC NAME 1 Contact Number"]
has_facebook_page = facebook_page.notna() & (facebook_page != "")
missing_contact_number = contact_number.isna() | (contact_number == "")

filtered_df = RecordOwl_Leads[has_facebook_page & missing_contact_number].copy()

# Separate working copy that the scraping cell fills in; all columns are kept.
facebook_only_df = filtered_df.copy()
facebook_only_df.shape

Loaded 592 rows from ./Staging/Gold/cleaned_second_592.parquet
(592, 62)


(26, 62)

In [3]:
# Initialize the ApifyClient with an API token taken from the environment.
# SECURITY: never hardcode the token in the notebook. Any token that was previously
# committed in this cell should be treated as leaked and revoked.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
if not APIFY_API_TOKEN:
    raise RuntimeError(
        "APIFY_API_TOKEN environment variable is not set; "
        "export it before running this cell."
    )
client = ApifyClient(APIFY_API_TOKEN)

# CONFIG - Optimized batch settings to MINIMIZE API credit usage
# Cost breakdown (FREE tier): $0.006 per actor start + $0.013 per result
# Batch approach saves ~24% on actor start costs vs individual runs
BATCH_SIZE = 10  # Process 10 Facebook pages per batch (increased from 5)
# NOTE(review): within this notebook MAX_CONCURRENCY only appears in log messages;
# it is not part of the actor input, so confirm whether the actor actually honours it.
MAX_CONCURRENCY = 3

# Enhanced function to validate Singapore phone numbers with multiple patterns
def validate_singapore_number(phone):
    """
    Validate and standardize a Singapore phone number.

    A number is accepted when, after removing separators, it is either
      * 8 digits starting with 6, 8, or 9 (local form), or
      * the country code 65 followed by such an 8-digit number
        (with or without a leading '+').

    Args:
        phone: Raw phone value (any type; converted with ``str``). Falsy values
            such as None or "" yield None.

    Returns:
        The number as "+65XXXXXXXX", or None when the input is not a valid
        Singapore number.
    """
    if not phone:
        return None

    # Remove all spaces, dashes, parentheses, dots, and common separators
    cleaned = re.sub(r'[\s\-\(\)\.\|/]', '', str(phone))

    # A leading '+' means a country code must follow, so the local form is not allowed.
    has_plus = cleaned.startswith('+')
    if has_plus:
        cleaned = cleaned[1:]

    local_pattern = r'^[689]\d{7}$'

    # Check the local 8-digit form FIRST. Local numbers may themselves begin with
    # "65" (e.g. 6512 3456); testing the country-code form first would strip those
    # two digits and wrongly reject the number.
    if not has_plus and re.match(local_pattern, cleaned):
        return f"+65{cleaned}"

    # Country-code form: 65 followed by a valid 8-digit number (10 digits total).
    if cleaned.startswith('65'):
        number_part = cleaned[2:]
        if re.match(local_pattern, number_part):
            return f"+65{number_part}"

    # Not a valid Singapore number
    return None


def run_facebook_scraper_batch(client, facebook_urls_batch):
    """
    Run the Apify Facebook actor once for a batch of page URLs and collect its results.

    The URLs are sent as the actor's "pages" input together with language "en-US".
    Per the original author's notes the actor reads only the publicly visible
    "About" section of each page (phone, email, website, address); that behaviour
    belongs to the actor and cannot be verified from this code.

    Args:
        client: Apify client exposing ``actor(id).call(run_input=...)`` and
            ``dataset(id).iterate_items()``.
        facebook_urls_batch: List of Facebook page URL strings.

    Returns:
        (results_list, error_message): the dataset items and None when the run
        could be read, or ([], message) when any exception occurred.
    """

    print(f"\n  üìã Batch URLs ({len(facebook_urls_batch)} pages):")
    for i, url in enumerate(facebook_urls_batch, 1):
        # Truncate long URLs for display
        display_url = url if len(url) <= 60 else url[:57] + "..."
        print(f"      {i}. {display_url}")

    # Actor input: all pages of the batch are processed in a single run.
    # MAX_CONCURRENCY is only reported in the log line below; it is not part of the input.
    run_input = {
        "pages": facebook_urls_batch,
        "language": "en-US",
    }

    print(f"\n  üöÄ Launching Apify actor (batch of {len(facebook_urls_batch)}, concurrency: {MAX_CONCURRENCY})...")
    print(f"  üí∞ Estimated cost: ${0.006 + (len(facebook_urls_batch) * 0.013):.4f} USD")

    try:
        # Run the Actor and wait for it to finish
        run = client.actor("oJ48ceKNY7ueGPGL0").call(run_input=run_input)

        # call() may yield no run object; report that explicitly instead of failing
        # later with an opaque "'NoneType' object is not subscriptable".
        if run is None:
            raise RuntimeError("Apify actor call returned no run object")

        status = run.get('status', 'RUNNING')
        print(f"  ‚è≥ Run ID: {run['id']}")
        print(f"  üìä Status: {status}")
        if status != 'SUCCEEDED':
            # Keep whatever the dataset holds, but make the incomplete run visible.
            print(f"  WARNING: run ended with status {status}; results may be incomplete")

        # Collect all results from the run's default dataset
        results = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        print(f"  ‚úÖ Retrieved {len(results)}/{len(facebook_urls_batch)} results from batch")
        return results, None

    except Exception as e:
        # Best effort: the caller treats a non-empty error message as a failed batch.
        print(f"  ‚ùå Batch error: {str(e)}")
        return [], str(e)


# Process all rows in facebook_only_df in batches of BATCH_SIZE pages.
# For each batch: run the scraper, match returned items to rows by normalized URL,
# and write a validated Singapore number (or None) into 'PIC NAME 1 Contact Number'.
if len(facebook_only_df) > 0:
    # Pricing assumptions used only for the cost estimates (USD).
    actor_start_cost_usd = 0.006
    result_cost_usd = 0.013
    phone_col = 'PIC NAME 1 Contact Number'

    total_rows = len(facebook_only_df)
    # Ceiling division. The up-front estimate previously used `len // BATCH_SIZE + 1`,
    # which counted one extra actor start whenever total_rows was an exact multiple
    # of BATCH_SIZE.
    num_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE

    print("=" * 80)
    print(f"üéØ FACEBOOK SCRAPER - Optimized Batch Processing")
    print("=" * 80)
    print(f"üìä Total Facebook pages: {total_rows}")
    print(f"üì¶ Batch size: {BATCH_SIZE} pages per batch")
    print(f"‚ö° Concurrency: {MAX_CONCURRENCY} pages scraped simultaneously per batch")
    print(f"üí∞ Estimated total cost: ${num_batches * actor_start_cost_usd + total_rows * result_cost_usd:.4f} USD")
    print(f"üìù Note: Scraper extracts ONLY publicly visible data from 'About' section")
    print("=" * 80)

    # Track overall results
    total_successful = 0
    total_failed = 0
    total_phones_found = 0
    total_cost_estimate = 0.0

    for batch_idx in range(0, total_rows, BATCH_SIZE):
        batch = facebook_only_df.iloc[batch_idx:batch_idx + BATCH_SIZE]
        # Only one column is needed, so iterate it directly instead of iterrows().
        facebook_urls = [str(url).strip() for url in batch['Facebook Page']]
        batch_indices = list(batch.index)

        print(f"\n{'‚îÄ'*80}")
        print(f"üì¶ Batch {(batch_idx//BATCH_SIZE)+1}/{num_batches}")

        # Run scraper for this batch
        items, error = run_facebook_scraper_batch(client, facebook_urls)

        # Track cost (counted even when the batch fails, since the actor was started)
        batch_cost = actor_start_cost_usd + (len(facebook_urls) * result_cost_usd)
        total_cost_estimate += batch_cost

        if error:
            print(f"  ‚ö†Ô∏è Batch failed, marking all {len(batch_indices)} pages as None")
            for idx in batch_indices:
                facebook_only_df.loc[idx, phone_col] = None
            total_failed += len(batch_indices)
            continue

        # Index the returned items by normalized Facebook URL
        # (lowercase, stripped, no trailing slash) so rows can be matched back.
        url_to_item = {}
        for item in items:
            # The URL may be reported under any of these keys
            fb_url = item.get('facebookUrl') or item.get('url') or item.get('pageUrl')
            if fb_url:
                url_to_item[fb_url.lower().strip().rstrip('/')] = item

        # Update each row in the batch
        print(f"\n  üìã Processing batch results:")
        for idx, original_url in zip(batch_indices, facebook_urls):
            item = url_to_item.get(original_url.lower().strip().rstrip('/'))

            if not item:
                # The actor returned nothing under this row's normalized URL
                facebook_only_df.loc[idx, phone_col] = None
                print(f"    ‚úó {original_url[:50]}... ‚Üí Not found in results")
                total_failed += 1
                continue

            # Take the first non-empty phone-like field, then keep it only if it is
            # a valid Singapore number.
            raw_phone = item.get('phone', None) or item.get('wa_number', None) or item.get('mobile', None)
            phone = validate_singapore_number(raw_phone)

            facebook_only_df.loc[idx, phone_col] = phone if phone else None
            if phone:
                print(f"    ‚úÖ {original_url[:50]}... ‚Üí {phone}")
                total_phones_found += 1
            else:
                print(f"    ‚ö†Ô∏è {original_url[:50]}... ‚Üí No valid SG number found")
            # A matched item counts as a successful scrape even without a phone
            total_successful += 1

        # Delay between batches to be respectful to API and avoid rate limiting
        if batch_idx + BATCH_SIZE < total_rows:
            print(f"\n  ‚è≥ Waiting 3 seconds before next batch...")
            time.sleep(3)

    # Print final summary
    print("\n" + "=" * 80)
    print("üìä FINAL SCRAPING SUMMARY:")
    print("=" * 80)
    print(f"  Total pages processed: {total_rows}")
    print(f"  Successful scrapes: {total_successful}")
    print(f"  Failed scrapes: {total_failed}")
    print(f"  Valid phone numbers found: {total_phones_found}")
    print(f"  Success rate: {(total_successful/total_rows)*100:.1f}%")
    print(f"  Phone found rate: {(total_phones_found/total_rows)*100:.1f}%")
    print(f"  üí∞ Estimated cost: ${total_cost_estimate:.4f} USD")
    if total_phones_found > 0:
        print(f"  üìä Cost per phone found: ${total_cost_estimate/total_phones_found:.4f} USD")
    else:
        print("  üìä Cost per phone: N/A")
    print("=" * 80)

    # Display the updated dataframe
    display(facebook_only_df)

else:
    print("No data available in facebook_only_df")

üéØ FACEBOOK SCRAPER - Optimized Batch Processing
üìä Total Facebook pages: 26
üì¶ Batch size: 10 pages per batch
‚ö° Concurrency: 3 pages scraped simultaneously per batch
üí∞ Estimated total cost: $0.3560 USD
üìù Note: Scraper extracts ONLY publicly visible data from 'About' section

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üì¶ Batch 1/3

  üìã Batch URLs (10 pages):
      1. https://www.facebook.com/theinternationalschooling/
      2. https://www.facebook.com/people/WeWin-Education-Centre/10...
      3. https://www.facebook.com/JeducationTravel/
      4. https://www.facebook.com/BLOCK71Singapore/photos/turning-...
      5. https://www.facebook.com/lifessimpletreasuresdtw/
      6. https://www.facebook.com/ioptechnologies
      7. https://www.facebook.com/upwardexim/
      8. https

[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> Status: RUNNING, Message: 
[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> 2025-11-19T08:25:29.008Z ACTOR: Pulling container image of build ZbEBvgTRDUl564DQQ from registry.
[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> 2025-11-19T08:25:29.009Z ACTOR: Creating container.
[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> 2025-11-19T08:25:29.185Z ACTOR: Starting container.
[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> 2025-11-19T08:25:30.430Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.15.2","osType":"Linux","nodeVersion":"v20.19.5"}[39m
[36m[apify.facebook-page-contact-information runId:R7LpD95BIk4NkQz7K][0m -> 2025-11-19T08:25:30.603Z [32mINFO[39m  Results Limit undefined
[36m[apify.facebook-page-contact-information runId:

  ‚è≥ Run ID: R7LpD95BIk4NkQz7K
  üìä Status: SUCCEEDED
  ‚úÖ Retrieved 10/10 results from batch

  üìã Processing batch results:
    ‚ö†Ô∏è https://www.facebook.com/theinternationalschooling... ‚Üí No valid SG number found
    ‚úÖ https://www.facebook.com/people/WeWin-Education-Ce... ‚Üí +6597808378
    ‚ö†Ô∏è https://www.facebook.com/JeducationTravel/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/BLOCK71Singapore/photos/t... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/lifessimpletreasuresdtw/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/ioptechnologies... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/upwardexim/... ‚Üí No valid SG number found
    ‚úÖ https://www.facebook.com/think.kids.singapore... ‚Üí +6567354438
    ‚ö†Ô∏è https://www.facebook.com/atlantisprep/... ‚Üí No valid SG number found
    ‚úÖ https://www.facebook.com/simpleplangroup/... ‚Üí +6564441562

  ‚è≥ Waiting 3 seconds before nex

[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> Status: RUNNING, Message: 
[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> 2025-11-19T08:26:43.489Z ACTOR: Pulling container image of build ZbEBvgTRDUl564DQQ from registry.
[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> 2025-11-19T08:26:43.491Z ACTOR: Creating container.
[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> 2025-11-19T08:26:43.531Z ACTOR: Starting container.
[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> 2025-11-19T08:26:44.704Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.15.2","osType":"Linux","nodeVersion":"v20.19.5"}[39m
[36m[apify.facebook-page-contact-information runId:fcmoHxRLyJpOA321C][0m -> 2025-11-19T08:26:44.832Z [32mINFO[39m  Results Limit undefined
[36m[apify.facebook-page-contact-information runId:

  ‚è≥ Run ID: fcmoHxRLyJpOA321C
  üìä Status: SUCCEEDED
  ‚úÖ Retrieved 10/10 results from batch

  üìã Processing batch results:
    ‚ö†Ô∏è https://www.facebook.com/achievia.educations2/... ‚Üí No valid SG number found
    ‚úÖ https://www.facebook.com/p/Glowing-Firefly-Consult... ‚Üí +6583235086
    ‚ö†Ô∏è https://www.facebook.com/photo.php?fbid=1221303762... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/people/aceitnow-home-tuit... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/BrilliantTOTSSG/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/profile.php?id=6156644989... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/alpha.education.and.sport... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/tesintl/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/kursusmakeup/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/gracelanguagecenter/... ‚Üí No valid SG numb

[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> Status: RUNNING, Message: 
[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> 2025-11-19T08:27:46.129Z ACTOR: Pulling container image of build ZbEBvgTRDUl564DQQ from registry.
[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> 2025-11-19T08:27:46.132Z ACTOR: Creating container.
[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> 2025-11-19T08:27:46.211Z ACTOR: Starting container.
[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> 2025-11-19T08:27:47.223Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.15.2","osType":"Linux","nodeVersion":"v20.19.5"}[39m
[36m[apify.facebook-page-contact-information runId:t71wxfKzNIzXtWzxP][0m -> 2025-11-19T08:27:47.365Z [32mINFO[39m  Results Limit undefined
[36m[apify.facebook-page-contact-information runId:

  ‚è≥ Run ID: t71wxfKzNIzXtWzxP
  üìä Status: SUCCEEDED
  ‚úÖ Retrieved 6/6 results from batch

  üìã Processing batch results:
    ‚ö†Ô∏è https://www.facebook.com/littleartlab.org/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/rochelle.hope.5/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/praxiumsg... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/groups/allinoneeducation/... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/p/Arexevan-Racing-1000720... ‚Üí No valid SG number found
    ‚ö†Ô∏è https://www.facebook.com/belvedereint/... ‚Üí No valid SG number found

üìä FINAL SCRAPING SUMMARY:
  Total pages processed: 26
  Successful scrapes: 26
  Failed scrapes: 0
  Valid phone numbers found: 4
  Success rate: 100.0%
  Phone found rate: 15.4%
  üí∞ Estimated cost: $0.3560 USD
  üìä Cost per phone found: $0.0890 USD


Unnamed: 0,ePOS Code,Company Code,Date,ACRA REGISTERED NAME,Brand/Deal Name/Business Name,Sub Domain Link (If Lead is already available in Backend) Fill only when EPOS client,Tele Sales or MR (For KPI - Internal),Name of the Market Researcher,Original Source (Marketing),"Marketing Source (Do not fill anything if the leads are from Hubspot, EPOS clients)",...,PIC Name Designation 3,PIC NAME 3 Contact Number,PIC 3 email address,FB/Insta/Tik Tok/Linkedin Contact,Current ePOS Client ?,"If ePOS Client, which product they are using?",Is this deal part of the Gov List?,Source from Market Researcher,Contact Number from Lusha?,Phone number Verified ?
285,,,2025-11-18,INTERNATIONAL SCHOOLING PTE. LTD.,SCHOOLING,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
295,,,2025-11-18,WEWIN EDUCATION CENTRE PTE. LTD.,WEWIN EDUCATION CENTRE,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
318,,,2025-11-18,J EDUCATION,J EDUCATION,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
322,,,2025-11-18,EDUVILLE PTE. LTD.,EDUVILLE,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
328,,,2025-11-18,AVENUE MENTOR,AVENUE MENTOR,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
333,,,2025-11-18,IOP LLP,IOP,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
356,,,2025-11-18,UPWARD LLP,UPWARD,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
369,,,2025-11-18,WAN COUNSELLING SERVICES,WAN COUNSELLING,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
385,,,2025-11-18,ATLANTIC PREP,ATLANTIC PREP,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
393,,,2025-11-18,SIMPLE PLAN CENTRE,SIMPLE PLAN CENTRE,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,


In [4]:
# Write the phone numbers scraped into facebook_only_df back onto RecordOwl_Leads.
# Rows are matched by index label, because facebook_only_df is a filtered copy of
# RecordOwl_Leads and therefore shares its index.
phone_col = 'PIC NAME 1 Contact Number'
banner = "=" * 80

print(banner)
print("üîÑ MERGING SCRAPED PHONE NUMBERS INTO RECORDOWL_LEADS")
print(banner)

# Snapshot taken before the merge, used for the before/after comparison
original_shape = RecordOwl_Leads.shape
original_phones = RecordOwl_Leads[phone_col].notna().sum()

print(f"\nüìä Before merge:")
print(f"  Total rows in RecordOwl_Leads: {len(RecordOwl_Leads)}")
print(f"  Rows with phone numbers: {original_phones}")
print(f"  Rows without phone numbers: {len(RecordOwl_Leads) - original_phones}")

updates_made = 0
phones_added = 0

for idx, scraped_phone in facebook_only_df[phone_col].items():
    # Skip rows where scraping produced nothing, so existing values are never blanked
    if pd.isna(scraped_phone) or scraped_phone == "":
        continue
    RecordOwl_Leads.loc[idx, phone_col] = scraped_phone
    updates_made += 1
    phones_added += 1

# Snapshot after the merge
final_phones = RecordOwl_Leads[phone_col].notna().sum()
final_shape = RecordOwl_Leads.shape

print(f"\nüìä After merge:")
print(f"  Total rows in RecordOwl_Leads: {len(RecordOwl_Leads)}")
print(f"  Rows with phone numbers: {final_phones}")
print(f"  Rows without phone numbers: {len(RecordOwl_Leads) - final_phones}")
print(f"\n‚úÖ Updates made: {updates_made} rows")
print(f"üìû New phone numbers added: {phones_added}")
print(f"üìà Phone number increase: {final_phones - original_phones}")

# Sanity checks: the merge must not add rows/columns or create duplicate rows
rows_are_unique = len(RecordOwl_Leads) == len(RecordOwl_Leads.drop_duplicates())
print(f"\nüîç Duplicate check:")
print(f"  Shape remained same: {original_shape == final_shape}")
print(f"  No duplicate rows: {rows_are_unique}")

print(banner)

# Show up to five of the rows that received a scraped number
if updates_made > 0:
    print(f"\nüìã Sample of updated rows (showing first 5):")
    updated_indices = facebook_only_df.index[facebook_only_df[phone_col].notna()][:5]
    display(RecordOwl_Leads.loc[updated_indices, ['Facebook Page', phone_col]])

RecordOwl_Leads.head()

üîÑ MERGING SCRAPED PHONE NUMBERS INTO RECORDOWL_LEADS

üìä Before merge:
  Total rows in RecordOwl_Leads: 592
  Rows with phone numbers: 277
  Rows without phone numbers: 315

üìä After merge:
  Total rows in RecordOwl_Leads: 592
  Rows with phone numbers: 281
  Rows without phone numbers: 311

‚úÖ Updates made: 4 rows
üìû New phone numbers added: 4
üìà Phone number increase: 4

üîç Duplicate check:
  Shape remained same: True
  No duplicate rows: True

üìã Sample of updated rows (showing first 5):


Unnamed: 0,Facebook Page,PIC NAME 1 Contact Number
295,https://www.facebook.com/people/WeWin-Educatio...,6597808378
369,https://www.facebook.com/think.kids.singapore,6567354438
393,https://www.facebook.com/simpleplangroup/,6564441562
408,https://www.facebook.com/p/Glowing-Firefly-Con...,6583235086


Unnamed: 0,ePOS Code,Company Code,Date,ACRA REGISTERED NAME,Brand/Deal Name/Business Name,Sub Domain Link (If Lead is already available in Backend) Fill only when EPOS client,Tele Sales or MR (For KPI - Internal),Name of the Market Researcher,Original Source (Marketing),"Marketing Source (Do not fill anything if the leads are from Hubspot, EPOS clients)",...,PIC Name Designation 3,PIC NAME 3 Contact Number,PIC 3 email address,FB/Insta/Tik Tok/Linkedin Contact,Current ePOS Client ?,"If ePOS Client, which product they are using?",Is this deal part of the Gov List?,Source from Market Researcher,Contact Number from Lusha?,Phone number Verified ?
0,,,2025-11-18,K-JOY EDUCARE PTE. LTD.,K-JOY EDUCARE,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
1,,,2025-11-18,TUNGSTAN EDLAB,TUNGSTAN EDLAB,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
2,,,2025-11-18,ADVENTURE TREE PRESCHOOL BRADDELL PTE. LTD.,ADVENTURE TREE PRESCHOOL BRADDELL,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
3,,,2025-11-18,GLOBAL ACHIEVERS EDUCATION CENTRE PTE. LTD.,ACHIEVERS EDUCATION CENTRE,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,
4,,,2025-11-18,SG NOAH'S ARK EDUTECH PTE. LTD.,NOAHS ARK EDUTECH,,TeleSales,Shafiqah,Offline Sources,Web Scrapping,...,,,,,,,Gov List,"ACRA, Google Searches",,


In [None]:
# Set the 'Contact Number from Lusha?' column to the constant 'No' for every row
# (the column is created if absent, overwritten otherwise).
RecordOwl_Leads['Contact Number from Lusha?'] = 'No'



In [8]:
# Mark every lead as not being a current ePOS client, then export the final table.
# The DataFrame index is written as the first (unnamed) CSV column.
output_csv_path = "./Fresh_Leads_19_11_25.csv"
RecordOwl_Leads['Current ePOS Client ?'] = 'No'
RecordOwl_Leads.to_csv(output_csv_path)