### Data Mining Facebook (Silver 2)

In [18]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re
from apify_client import ApifyClient


### Ingest Data

In [19]:
import os
import pandas as pd

# --- Load Parquet ---
parquet_path = "./Staging/Silver/carmotor_facebook_only.parquet"

# Fail fast with a clear message when the staged file is missing.
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

facebook_only_df = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(facebook_only_df)} rows from {parquet_path}")
print(facebook_only_df.shape)

# --- Filter rows where Facebook is not None/NaN ---
# .copy() detaches the filtered rows from the loaded frame, so the column
# assignments performed later in the notebook write to an independent frame
# instead of a slice (which is what triggers pandas' SettingWithCopyWarning).
facebook_only_df = facebook_only_df[facebook_only_df["Facebook"].notna()].copy()

# --- Clean the existing Facebook column in-place ---
def clean_facebook(x):
    """Reduce one raw ``Facebook`` cell to a single URL value or None.

    Args:
        x: the raw cell value. May be a sequence of URLs (list, tuple or
            numpy array), a missing value (None/NaN), or a plain value such
            as a string.

    Returns:
        The first element when ``x`` is a non-empty sequence, None when ``x``
        is an empty sequence or a missing value, otherwise ``x`` unchanged.
    """
    # Sequences are handled before pd.isna(): on an array-like pd.isna()
    # returns an array, and using that in a boolean test raises ValueError.
    if isinstance(x, (list, tuple, np.ndarray)):
        return x[0] if len(x) > 0 else None  # take first URL or None
    if pd.isna(x):                            # None / NaN
        return None
    return x                                  # already a scalar (e.g. string)

# .assign() builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
facebook_only_df = facebook_only_df.assign(
    Facebook=facebook_only_df["Facebook"].apply(clean_facebook)
)

# --- Remove empty or None values after cleaning ---
has_facebook_url = facebook_only_df["Facebook"].notna() & (facebook_only_df["Facebook"] != "")
# .copy() so that later writes to this frame (e.g. filling the Phones column
# with .loc) modify an independent frame and not a slice.
facebook_only_df = facebook_only_df[has_facebook_url].copy()

# --- Inspect cleaned data ---
facebook_only_df.head()


Loaded 47 rows from ./Staging/Silver/carmotor_facebook_only.parquet
(47, 16)


Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,RecordOwl_Link,operational_street,operational_unit,operational_postal_code,operational_address,Phones_str
0,202503911K,success,,,,,https://www.facebook.com/thedetailingcoltd/,,[https://www.instagram.com/detailingco.pk/],,https://recordowl.com/company/the-detailingco-...,61 WOODLANDS INDUSTRIAL PARK E9 E9 PREMIUM,01-20,757047,61 WOODLANDS INDUSTRIAL PARK E9 E9 PREMIUM 01-...,
1,202412050W,success,,,,https://www.classicmotorworks.com.sg/?utm_sour...,https://www.facebook.com/classicmotorworkssg,,[https://www.instagram.com/classicmotorworkssg/],,https://recordowl.com/company/jinson-garage-pt...,60 JALAN LAM HUAT CARROS CENTRE,05-58,737869,60 JALAN LAM HUAT CARROS CENTRE 05-58 Singapor...,
2,53459382L,success,,,,,https://m.facebook.com/100090788316126,,,,https://recordowl.com/company/vegas-motor-works,9002 TAMPINES STREET 93 TAMPINES INDUSTRIAL PA...,01-40,528836,9002 TAMPINES STREET 93 TAMPINES INDUSTRIAL PA...,
3,53471316E,success,,[admin@xgarage.com.my],,https://xgarage.com.my/,https://www.facebook.com/xgarage.xgarage,,[https://www.instagram.com/x.garage_/],,https://recordowl.com/company/x-garage,15 KAKI BUKIT ROAD 4 BARTLEY BIZ CENTRE,01-53,417808,15 KAKI BUKIT ROAD 4 BARTLEY BIZ CENTRE 01-53 ...,
4,53427200B,success,,[3hautotrading@gmail.com],,,https://www.facebook.com/3HAutoTrading/,,,,https://recordowl.com/company/3h-auto-trading,192 PANDAN LOOP PANTECH BUSINESS HUB,07-23,128381,192 PANDAN LOOP PANTECH BUSINESS HUB 07-23 Sin...,


In [None]:
# parquet_path = "./Staging/Gold/cleaned_second_592.parquet"
# if os.path.exists(parquet_path):
#     RecordOwl_Leads = pd.read_parquet(parquet_path, engine="fastparquet")
#     print(f"Loaded {len(RecordOwl_Leads)} rows from {parquet_path}")
#     print(RecordOwl_Leads.shape)
# else:
#     raise FileNotFoundError(f"Parquet file not found at {parquet_path}")


# filtered_df = RecordOwl_Leads[
#     (RecordOwl_Leads["Facebook Page"].notna()) & 
#     (RecordOwl_Leads["Facebook Page"] != "") &
#     (RecordOwl_Leads["PIC NAME 1 Contact Number"].isna() | 
#      (RecordOwl_Leads["PIC NAME 1 Contact Number"] == ""))
# ].copy()

# # filtered_df = filtered_df[["PIC NAME 1 Contact Number", "Facebook Page"]]
# facebook_only_df = filtered_df.copy() 
# facebook_only_df.shape

In [None]:
# Initialize the ApifyClient with an API token read from the environment.
# The token must never be hardcoded: notebooks are shared and committed, and a
# token that has appeared in source should be treated as leaked and rotated.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
if not APIFY_API_TOKEN:
    raise RuntimeError("Set the APIFY_API_TOKEN environment variable before running this cell")
client = ApifyClient(APIFY_API_TOKEN)

BATCH_SIZE = 100  # Facebook pages submitted per actor run (one run per batch)
MAX_CONCURRENCY = 3  # Only reported in the log output; it is not part of the actor input

def validate_singapore_number(phone):
    """Validate a Singapore phone number and return it as ``+65XXXXXXXX``.

    Separators (whitespace, ``-``, ``(``, ``)``, ``.``, ``|``, ``/``) and one
    leading ``+`` are removed first. The remaining text is accepted when it is
    either ``65`` followed by an 8-digit number starting with 6, 8 or 9, or
    such an 8-digit number on its own.

    Args:
        phone: raw phone value; anything falsy (None, "", 0) yields None.
            Other values are converted with ``str()``.

    Returns:
        The number formatted as ``+65`` plus 8 digits, or None when the input
        is not recognised as a Singapore number.
    """
    if not phone:
        return None

    local_pattern = r'^[689]\d{7}$'
    cleaned = re.sub(r'[\s\-\(\)\.\|/]', '', str(phone))

    has_plus = cleaned.startswith('+')
    if has_plus:
        cleaned = cleaned[1:]

    if cleaned.startswith('65'):
        number_part = cleaned[2:]
        if re.match(local_pattern, number_part):
            return f"+65{number_part}"
        if has_plus:
            # "+65..." states the country code explicitly, so a wrong-length
            # remainder is invalid rather than a local number.
            return None
        # No "+": the leading "65" may simply be the first two digits of an
        # 8-digit local number (e.g. 6512 3456), so fall through to the local check.

    if re.match(local_pattern, cleaned):
        return f"+65{cleaned}"
    return None


def run_facebook_scraper_batch(client, facebook_urls_batch):
    """Run the Apify Facebook scraper actor for one batch of page URLs.

    Args:
        client: Apify client used to start the actor and read its dataset.
        facebook_urls_batch: list of Facebook page URL strings to scrape.

    Returns:
        tuple: ``(results, error)``. On success ``results`` is the list of
        dataset items and ``error`` is None. On failure ``results`` is ``[]``
        and ``error`` is the error message string. Errors are reported this
        way rather than raised, so the caller can continue with other batches.
    """
    
    print(f"\n  Batch URLs ({len(facebook_urls_batch)} pages):")
    for i, url in enumerate(facebook_urls_batch, 1):
        # Truncate long URLs so each one stays on a single log line
        display_url = url if len(url) <= 60 else url[:57] + "..."
        print(f"      {i}. {display_url}")
    
    # Exact format from working reference code
    run_input = {
        "pages": facebook_urls_batch,
        "language": "en-US",
    }

    print(f"\n  Launching Apify actor (batch of {len(facebook_urls_batch)}, concurrency: {MAX_CONCURRENCY})...")
    print(f"  Estimated cost: ${0.006 + (len(facebook_urls_batch) * 0.013):.4f} USD")
    
    try:
        run = client.actor("oJ48ceKNY7ueGPGL0").call(run_input=run_input)

        # call() can come back without a run object; report that explicitly
        # instead of failing on run['id'] with an opaque TypeError message.
        if run is None:
            message = "Apify actor call returned no run object"
            print(f"  Batch error: {message}")
            return [], message
        
        status = run.get('status', 'RUNNING')
        print(f"  Run ID: {run['id']}")
        print(f"  Status: {status}")
        if status in ('FAILED', 'ABORTED', 'TIMED-OUT'):
            # Keep whatever the run stored, but make the partial outcome visible.
            print(f"  Warning: run ended with status {status}; results may be incomplete")
        
        results = list(client.dataset(run["defaultDatasetId"]).iterate_items())
        
        print(f"  Retrieved {len(results)}/{len(facebook_urls_batch)} results from batch")
        return results, None
        
    except Exception as e:
        print(f"  Batch error: {str(e)}")
        return [], str(e)


# Process all rows
# Figures used by the cost estimates below (USD), carried over from the
# original estimate formula: a fixed amount per actor run plus an amount per page.
ACTOR_START_COST = 0.006
COST_PER_PAGE = 0.013
BASELINE_BATCH_SIZE = 10       # batch size the "savings" lines compare against
PAUSE_BETWEEN_BATCHES = 3      # seconds to wait between consecutive actor runs
URL_FIELDS = ('facebookUrl', 'url', 'pageUrl')    # result keys that may hold the page URL
PHONE_FIELDS = ('phone', 'wa_number', 'mobile')   # result keys that may hold a phone number


def _normalize_facebook_url(url):
    """Reduce a Facebook URL to a key for matching inputs against results.

    Lower-cases and strips the text, then removes the scheme, a leading
    ``www.``/``m.``/``web.``/``mobile.`` host prefix and trailing slashes, so
    that e.g. ``https://m.facebook.com/X/`` and ``https://www.facebook.com/x``
    produce the same key. The path and query string are left untouched.
    """
    key = str(url).strip().lower()
    key = re.sub(r'^https?://', '', key)
    key = re.sub(r'^(?:www|m|web|mobile)\.', '', key)
    return key.rstrip('/')


def _ceil_div(numerator, denominator):
    """Integer division rounded up (number of batches needed for N rows)."""
    return (numerator + denominator - 1) // denominator


if len(facebook_only_df) > 0:
    total_rows = len(facebook_only_df)
    num_batches = _ceil_div(total_rows, BATCH_SIZE)
    num_batches_baseline = _ceil_div(total_rows, BASELINE_BATCH_SIZE)

    # Startup cost is paid once per actor run, so fewer batches means lower cost.
    startup_cost = num_batches * ACTOR_START_COST
    startup_cost_baseline = num_batches_baseline * ACTOR_START_COST
    savings_pct = (
        (startup_cost_baseline - startup_cost) / startup_cost_baseline * 100
        if startup_cost_baseline > 0 else 0
    )

    print("=" * 80)
    print(f"FACEBOOK SCRAPER - MAXIMUM COST OPTIMIZATION ({BATCH_SIZE}-Batch Config)")
    print("=" * 80)
    print(f"Total Facebook pages: {total_rows}")
    print(f"Batch size: {BATCH_SIZE} pages per batch")
    print(f"Concurrency: {MAX_CONCURRENCY} pages scraped simultaneously per batch")
    print(f"Estimated total cost: ${startup_cost + total_rows * COST_PER_PAGE:.4f} USD")
    print(f"\nCost Optimization vs {BASELINE_BATCH_SIZE}-Batch Config:")
    print(f"   {savings_pct:.0f}% fewer actor runs ({num_batches} vs {num_batches_baseline} batches)")
    print(f"   Startup cost savings: ${startup_cost_baseline - startup_cost:.4f} USD")
    print("=" * 80)

    total_successful = 0
    total_failed = 0
    total_phones_found = 0
    total_cost_estimate = 0.0

    for batch_number, batch_start in enumerate(range(0, total_rows, BATCH_SIZE), start=1):
        batch = facebook_only_df.iloc[batch_start:batch_start + BATCH_SIZE]
        facebook_urls = batch['Facebook'].astype(str).str.strip().tolist()

        print(f"\n{'-'*80}")
        print(f"Batch {batch_number}/{num_batches}")

        items, error = run_facebook_scraper_batch(client, facebook_urls)

        # The estimate is accumulated whether or not the batch succeeded.
        total_cost_estimate += ACTOR_START_COST + (len(facebook_urls) * COST_PER_PAGE)

        if error:
            print(f"  Batch failed, marking all {len(batch)} pages as None")
            facebook_only_df.loc[batch.index, 'Phones'] = None
            total_failed += len(batch)
            continue

        # Map results back to dataframe.
        # url_to_item is keyed by each result's first available URL field;
        # alias_to_item holds the remaining URL fields as a fallback, so a row
        # still matches when the result reports a different URL field value.
        url_to_item = {}
        alias_to_item = {}
        for item in items:
            found_urls = [item.get(field) for field in URL_FIELDS]
            found_urls = [value for value in found_urls if value]
            if not found_urls:
                continue
            url_to_item[_normalize_facebook_url(found_urls[0])] = item
            for alias in found_urls[1:]:
                alias_to_item.setdefault(_normalize_facebook_url(alias), item)

        print(f"\n  Processing batch results:")
        for idx, fb_value in batch['Facebook'].items():
            original_url = str(fb_value).strip()
            lookup_key = _normalize_facebook_url(original_url)
            item = url_to_item.get(lookup_key) or alias_to_item.get(lookup_key)

            if not item:
                facebook_only_df.loc[idx, 'Phones'] = None
                print(f"    FAIL {original_url[:50]}... -> Not found in results")
                total_failed += 1
                continue

            # Try every phone field in priority order and keep the first value
            # that validates, so an invalid earlier field does not hide a valid
            # Singapore number in a later one.
            phone = None
            for field in PHONE_FIELDS:
                phone = validate_singapore_number(item.get(field))
                if phone:
                    break

            if phone:
                facebook_only_df.loc[idx, 'Phones'] = phone
                print(f"    OK {original_url[:50]}... -> {phone}")
                total_phones_found += 1
            else:
                facebook_only_df.loc[idx, 'Phones'] = None
                print(f"    WARN {original_url[:50]}... -> No valid SG number found")
            total_successful += 1

        # Pause between actor runs, but not after the last batch.
        if batch_start + BATCH_SIZE < total_rows:
            print(f"\n  Waiting {PAUSE_BETWEEN_BATCHES} seconds before next batch...")
            time.sleep(PAUSE_BETWEEN_BATCHES)

    print("\n" + "=" * 80)
    print("FINAL SCRAPING SUMMARY:")
    print("=" * 80)
    print(f"  Total pages processed: {total_rows}")
    print(f"  Successful scrapes: {total_successful}")
    print(f"  Failed scrapes: {total_failed}")
    print(f"  Valid phone numbers found: {total_phones_found}")
    print(f"  Success rate: {(total_successful/total_rows)*100:.1f}%")
    print(f"  Phone found rate: {(total_phones_found/total_rows)*100:.1f}%")
    print(f"  Estimated cost: ${total_cost_estimate:.4f} USD")
    print(f"  Cost per phone found: ${total_cost_estimate/total_phones_found:.4f} USD" if total_phones_found > 0 else "  Cost per phone: N/A")
    print("=" * 80)

    display(facebook_only_df)

else:
    print("No data available in facebook_only_df")

In [None]:
# Keep only the rows for which a phone number was stored
has_phone = facebook_only_df["Phones"].notna()
df_with_phones = facebook_only_df.loc[has_phone]

# True when no phone number appears on more than one row
print(df_with_phones["Phones"].is_unique)


True


In [None]:
# Split the frame on whether a phone number is present
phone_present = facebook_only_df["Phones"].notna()
df_with_phones = facebook_only_df.loc[phone_present]
df_without_phones = facebook_only_df.loc[~phone_present]

# keep=False flags every row whose phone number occurs more than once
shares_phone = df_with_phones["Phones"].duplicated(keep=False)

# Rows whose phone number occurs exactly once
final_df_1 = df_with_phones.loc[~shares_phone]

# Rows whose phone number is shared with at least one other row
refilter_df_1 = df_with_phones.loc[shares_phone]


In [None]:
# Stack the duplicate-phone rows on top of the rows without a phone,
# renumbering the index from 0
frames_to_recheck = [refilter_df_1, df_without_phones]
df_without_phones_2 = pd.concat(frames_to_recheck, ignore_index=True)
df_without_phones_2.shape

In [17]:
# Display the combined frame of duplicate-phone rows and rows without a phone
df_without_phones_2

Unnamed: 0,UEN,Status,Error,Emails,Phones,Website,Facebook,LinkedIn,Instagram,TikTok,RecordOwl_Link,operational_street,operational_unit,operational_postal_code,operational_address,Phones_str
0,202503911K,success,,,,,https://www.facebook.com/thedetailingcoltd/,,[https://www.instagram.com/detailingco.pk/],,https://recordowl.com/company/the-detailingco-...,61 WOODLANDS INDUSTRIAL PARK E9 E9 PREMIUM,01-20,757047,61 WOODLANDS INDUSTRIAL PARK E9 E9 PREMIUM 01-...,
1,53459382L,success,,,,,https://m.facebook.com/100090788316126,,,,https://recordowl.com/company/vegas-motor-works,9002 TAMPINES STREET 93 TAMPINES INDUSTRIAL PA...,01-40,528836,9002 TAMPINES STREET 93 TAMPINES INDUSTRIAL PA...,
2,53471316E,success,,[admin@xgarage.com.my],,https://xgarage.com.my/,https://www.facebook.com/xgarage.xgarage,,[https://www.instagram.com/x.garage_/],,https://recordowl.com/company/x-garage,15 KAKI BUKIT ROAD 4 BARTLEY BIZ CENTRE,01-53,417808,15 KAKI BUKIT ROAD 4 BARTLEY BIZ CENTRE 01-53 ...,
3,53466996D,success,,,,,https://www.facebook.com/p/RMS-Motors-10006972...,,[https://www.instagram.com/rms.motors/],,https://recordowl.com/company/rms-motors,60 JALAN LAM HUAT CARROS CENTRE,03-27,737869,60 JALAN LAM HUAT CARROS CENTRE 03-27 Singapor...,
4,53482722E,success,,,,,https://www.facebook.com/PJsGarageLimerick/,[https://ie.linkedin.com/in/paul-hoffmann-2889...,"[https://www.instagram.com/pjgarage/, https://...",,https://recordowl.com/company/pjs-garage,412 BUKIT BATOK WEST AVENUE 4,10-320,650412,412 BUKIT BATOK WEST AVENUE 4 10-320 Singapore...,
5,53448674J,success,,[superjservicesltd@gmail.com],,,https://www.facebook.com/treesurgeonBath/,,[https://www.instagram.com/tree_surgeon_bath/],,https://recordowl.com/company/super-j-services,60 PAYA LEBAR ROAD PAYA LEBAR SQUARE,06-33,409051,60 PAYA LEBAR ROAD PAYA LEBAR SQUARE 06-33 Sin...,
6,53431738C,success,,,,,https://www.facebook.com/AtatcutzAppointment/p...,,,,https://recordowl.com/company/dailsun-autoservice,570A WOODLANDS AVENUE 1 CHAMPIONS COURT,07-888,731570,570A WOODLANDS AVENUE 1 CHAMPIONS COURT 07-888...,
7,53484790X,success,,[apexmotorworksandretoration@yahoo.com],,,https://m.facebook.com/61556350627595/,,,,https://recordowl.com/company/apex-motorworks,514 PASIR RIS STREET 52,06-91,510514,514 PASIR RIS STREET 52 06-91 Singapore 510514,
8,202505190W,success,,,,,https://www.facebook.com/Ignite%20Autowerkz,,[https://www.instagram.com/igniteautowerkz],,https://recordowl.com/company/ignite-werkz-pte...,1 KAKI BUKIT AVENUE 6 AUTOBAY @ KAKI BUKIT,02-61,417883,1 KAKI BUKIT AVENUE 6 AUTOBAY @ KAKI BUKIT 02-...,
9,53476477X,success,,,,,https://www.facebook.com/p/YL-Car-Accessories-...,,,,https://recordowl.com/company/yl-car-audio-acc...,160 SIN MING DRIVE SIN MING AUTOCITY,04-02,575722,160 SIN MING DRIVE SIN MING AUTOCITY 04-02 Sin...,


In [15]:
# (rows, columns) of the frame holding the rows with a non-duplicated phone number
final_df_1.shape

(15, 16)

In [None]:
# RecordOwl_Leads.to_csv("./Fresh_Leads_19_11_25.csv")