In [54]:
import os
import re
import glob
import time
import asyncio
import requests
import aiohttp
import nest_asyncio

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup

from fuzzywuzzy import fuzz, process

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build


### with phone

In [67]:
# Load the records that already have a phone number.
# They are bound to `spa_with_phone` (not `spa_without_phone`) so that running
# this cell can never overwrite the frame the phone-scraping cells work on.
parquet_path = "./Staging/Gold/spa_with_phone.parquet"
if not os.path.exists(parquet_path):
    # Fail early with the offending path instead of a lower-level reader error.
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

spa_with_phone = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(spa_with_phone)} rows from {parquet_path}")
print(spa_with_phone.shape)

Loaded 2559 rows from ./Staging/Gold/spa_with_phone.parquet
(2559, 20)


### without phone

In [56]:
# Load the records that still need a phone number; abort if the file is absent.
parquet_path = "./Staging/Gold/spa_without_phone_formatted.parquet"
if not os.path.exists(parquet_path):
    raise FileNotFoundError(f"Parquet file not found at {parquet_path}")

spa_without_phone = pd.read_parquet(parquet_path, engine="fastparquet")
print(f"Loaded {len(spa_without_phone)} rows from {parquet_path}")
print(spa_without_phone.shape)

Loaded 4061 rows from ./Staging/Gold/spa_without_phone_formatted.parquet
(4061, 65)


In [57]:

# List every column name of the loaded frame as a plain Python list.
print(list(spa_without_phone.columns))


['ePOS Code', 'Company Code', 'Date', 'ACRA REGISTERED NAME', 'Brand/Deal Name/Business Name', 'Sub Domain Link (If Lead is already available in Backend) Fill only when EPOS client', 'Tele Sales or MR (For KPI - Internal)', 'Name of the Market Researcher', 'Original Source (Marketing)', 'Marketing Source (Do not fill anything if the leads are from Hubspot, EPOS clients)', 'Company Registration date / Date Established', 'Company Registration Number (UEN)', 'Primary SSIC Code', 'Secondary SSIC Code', 'Hubspot ID (Company)', 'Hubspot ID(Deal)', 'Hubspot ID(Contact)', 'Website URL', 'Business Type', 'Facebook Page', 'Instagram URL', 'Linkedin URL', 'Tik Tok URL', 'Ownership Type', 'Parent Industry Type', 'Industry Type', 'Sub Industry', 'Business model', 'Presence of Multiple Outlets', 'Number of Outlets (Write in #)', 'Region', 'Planning Area', 'Business Location Type', 'Registered Address (Block & Street)', 'Registered Address (Unit #)', 'Registered Address (Postal code)', 'Operational A

In [58]:
# Check that no UEN value occurs more than once; the boolean is shown as the cell output.
spa_without_phone["Company Registration Number (UEN)"].is_unique

True

In [59]:
# Start both contact columns out empty (None in every row); the Google Maps
# lookup further down fills them in for the rows where a phone is found.
for blank_column in ('PIC NAME 1 Contact Number', 'PIC 1 Source'):
    spa_without_phone[blank_column] = None


In [60]:
# Show the frame as the cell output for a visual check.
spa_without_phone

Unnamed: 0,ePOS Code,Company Code,Date,ACRA REGISTERED NAME,Brand/Deal Name/Business Name,Sub Domain Link (If Lead is already available in Backend) Fill only when EPOS client,Tele Sales or MR (For KPI - Internal),Name of the Market Researcher,Original Source (Marketing),"Marketing Source (Do not fill anything if the leads are from Hubspot, EPOS clients)",...,PIC NAME 3 Contact Number,PIC 3 email address,PIC 3 Source,FB/Insta/Tik Tok/Linkedin Contact,Current ePOS Client ?,"If ePOS Client, which product they are using?",Is this deal part of the Gov List?,Source from Market Researcher,Contact Number from Lusha?,Phone number Verified ?
0,,,13012026,LUCKY BEAUTY STUDIO PTE. LTD.,LUCKY BEAUTY STUDIO,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
1,,,13012026,JESSMINE BEAUTY EYLASH NAIL,JESSMINE BEAUTY EYLASH NAIL,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
2,,,13012026,NOOR SPA & BEAUTY PTE. LTD.,NOOR SPA & BEAUTY,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
3,,,13012026,REVIVISCO HAIR PTE. LTD.,REVIVISCO HAIR,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
4,,,13012026,D. LIKE HAIR AESTHETIC PTE. LTD.,D LIKE HAIR AESTHETIC,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,,,13012026,LINDA BEAUTY,LINDA BEAUTY,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
4057,,,13012026,SOULSLIM AESTHETICS,SOULSLIM AESTHETICS,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
4058,,,13012026,KAJOL BRIDALS,KAJOL BRIDALS,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,
4059,,,13012026,SA SA BEAUTY SALON PTE. LTD.,SA SA BEAUTY SALON,,TeleSales,Hazim,Offline Sources,Web Scrapping,...,,,,,No,,Gov List,"ACRA, Google Searches",,


In [None]:
# =============================================================================
# GOOGLE MAPS PHONE NUMBER SEARCH - COST OPTIMIZED
# =============================================================================

from apify_client import ApifyClient
import pandas as pd
import time
import re
from fuzzywuzzy import fuzz

# Initialize Apify client.
# The API token is read from the APIFY_TOKEN environment variable so it never has
# to be pasted into the notebook; when the variable is unset this falls back to
# the empty token that was hard-coded here before.
client = ApifyClient(os.environ.get("APIFY_TOKEN", ""))

# COST-OPTIMIZED CONFIGURATION
BATCH_SIZE = 500             # number of search queries submitted per actor run
MAX_CONCURRENCY = 5          # forwarded to the actor as "maxConcurrency"
FUZZY_MATCH_THRESHOLD = 75   # default minimum score accepted by fuzzy_match_company

# ---- Input Data ----
# Queries are built from a copy; found phones are written back to
# spa_without_phone itself at the end of this cell.
companies_to_search = spa_without_phone.copy()


def validate_singapore_phone(phone):
    """Normalise a Singapore phone number to the ``+65XXXXXXXX`` form.

    Whitespace and the characters ``- ( ) . | / +`` are removed first. What
    remains must be either an 8-digit number starting with 6, 8 or 9, or that
    same number prefixed with the country code ``65`` (10 characters in total).

    Args:
        phone: Raw phone value; it is converted with ``str()`` before cleaning.

    Returns:
        The number as ``"+65"`` followed by 8 digits, or ``None`` when the
        input is falsy or does not fit either accepted shape.
    """
    if not phone:
        return None

    digits = re.sub(r'[\s\-\(\)\.\|/\+]', '', str(phone))

    # Only a 10-character value starting with "65" is read as carrying the
    # country code; an 8-digit landline such as 65xx xxxx is left untouched.
    has_country_code = digits.startswith('65') and len(digits) == 10
    local_number = digits[2:] if has_country_code else digits

    if re.match(r'^[689]\d{7}$', local_number):
        return f"+65{local_number}"
    return None


def _clean_text(value):
    """Return ``value`` as a stripped string, mapping None/NaN/NA to ''."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value).strip()


def create_search_queries(df):
    """Create search queries from ACRA REGISTERED NAME and address.

    Args:
        df: DataFrame read through the columns 'ACRA REGISTERED NAME' and
            'Registered Address (Block & Street)'; a column that is absent is
            treated as empty for every row.

    Returns:
        A list of dicts, one per row that has a usable name, with the keys
        'idx' (the row's index label), 'entity_name' and 'search_query'.
        The query is "<name> <address> Singapore", or "<name> Singapore" when
        the address is missing. Rows without a name are skipped.
    """
    queries = []
    for idx, row in df.iterrows():
        # Null cells (None / NaN / pd.NA) must become '' here: str(None) would
        # otherwise leak the literal text "None" into the search query.
        entity_name = _clean_text(row.get('ACRA REGISTERED NAME', ''))
        address = _clean_text(row.get('Registered Address (Block & Street)', ''))

        # The literal 'nan' is still treated as missing, as before, to cover
        # values that were already stringified upstream.
        if not entity_name or entity_name == 'nan':
            continue

        # Build search query with name and address
        if address and address != 'nan':
            search_query = f"{entity_name} {address} Singapore"
        else:
            search_query = f"{entity_name} Singapore"

        queries.append({
            'idx': idx,
            'entity_name': entity_name,
            'search_query': search_query
        })

    return queries


def run_google_places_scraper(client, search_queries_batch):
    """Run the "compass/crawler-google-places" actor for one batch of queries.

    Args:
        client: Apify client object; this function uses its ``actor``, ``run``
            and ``dataset`` accessors.
        search_queries_batch: Iterable of dicts whose 'search_query' values
            are sent to the actor as "searchStringsArray".

    Returns:
        A ``(items, error)`` tuple. On success ``items`` is the list of dataset
        items and ``error`` is ``None``. On any failure ``items`` is ``[]`` and
        ``error`` is a message; exceptions raised while talking to the API are
        caught and reported through that message rather than propagated.
    """
    # One place per search and every optional extra switched off.
    actor_input = {
        "searchStringsArray": [entry['search_query'] for entry in search_queries_batch],
        "maxCrawledPlacesPerSearch": 1,
        "scrapeContacts": False,
        "scrapePlaceDetailPage": False,
        "maxReviews": 0,
        "maxImages": 0,
        "scrapeImages": False,
        "scrapeReviewsPersonalData": False,
        "scrapeTableReservationProvider": False,
        "scrapeDirectories": False,
        "includeWebResults": False,
        "language": "en",
        "skipClosedPlaces": False,
        "deeperCityScrape": False,
        "oneReviewPerRow": False,
        "maxConcurrency": MAX_CONCURRENCY,
        "proxyConfig": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["SHADER"]
        },
    }

    try:
        run = client.actor("compass/crawler-google-places").call(run_input=actor_input)

        # Anything other than a non-empty dict carrying an 'id' is unusable.
        run_is_usable = bool(run) and isinstance(run, dict) and 'id' in run
        if not run_is_usable:
            return [], "API returned invalid response"

        finished_run = client.run(run["id"]).wait_for_finish()
        status = finished_run.get('status', 'UNKNOWN')

        if status in ('FAILED', 'TIMED-OUT', 'ABORTED'):
            return [], f"Actor run {status}"

        if status != "SUCCEEDED" or "defaultDatasetId" not in run:
            return [], f"Scraping failed: {status}"

        scraped_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
        return scraped_items, None

    except Exception as exc:
        return [], f"Error: {type(exc).__name__}: {str(exc)}"


def fuzzy_match_company(entity_name, google_results, threshold=FUZZY_MATCH_THRESHOLD):
    """Pick the Google Places result whose name best matches ``entity_name``.

    Each result's 'title' (or, when that is empty, 'name') is upper-cased and
    stripped, then scored against the equally normalised ``entity_name`` with
    four fuzzywuzzy scorers; the highest of the four is the result's score.

    Args:
        entity_name: Company name to look for.
        google_results: Iterable of result dicts.
        threshold: Minimum score for a result to be accepted.

    Returns:
        ``(result, score)`` for the best-scoring result when its score reaches
        ``threshold``; otherwise ``(None, best_score)``. ``(None, 0)`` when
        either argument is empty.
    """
    if not (google_results and entity_name):
        return None, 0

    target = entity_name.upper().strip()
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )

    top_candidate, top_score = None, 0
    for candidate in google_results:
        candidate_name = candidate.get('title', '') or candidate.get('name', '')
        if not candidate_name:
            continue

        normalised = candidate_name.upper().strip()
        score = max(scorer(target, normalised) for scorer in scorers)

        # Strictly greater: the first result wins a tie.
        if score > top_score:
            top_candidate, top_score = candidate, score

    accepted = top_candidate if top_score >= threshold else None
    return accepted, top_score


# ---- Main Execution ----
# Build one query per company, then work out how many actor runs are needed.
search_queries = create_search_queries(companies_to_search)
total_queries = len(search_queries)
num_batches = -(-total_queries // BATCH_SIZE)  # ceiling division

print(f"Processing {total_queries} queries in {num_batches} batch(es)...")

# Process in batches: one actor run per slice of BATCH_SIZE queries.
# Every query ends up with exactly one entry in all_results.
all_results = []
phones_found = 0

for batch_num, batch_idx in enumerate(range(0, total_queries, BATCH_SIZE), start=1):
    batch = search_queries[batch_idx:batch_idx + BATCH_SIZE]

    print(f"Batch {batch_num}/{num_batches}...")

    items, error = run_google_places_scraper(client, batch)

    if error:
        # The whole batch is recorded as failed and the next one starts at once.
        print(f"  Error: {error}")
        all_results.extend(
            {'idx': query['idx'], 'GMaps_Phone': None, 'GMaps_Status': 'error'}
            for query in batch
        )
        continue

    # Group results by the search string that produced them
    results_by_query = {}
    for item in items:
        results_by_query.setdefault(item.get('searchString', ''), []).append(item)

    # Match results back to the query they belong to
    for query in batch:
        google_results = results_by_query.get(query['search_query'], [])
        phone = None

        if not google_results:
            status = 'no_results'
        else:
            best_match, score = fuzzy_match_company(query['entity_name'], google_results)
            if not best_match:
                status = 'no_match'
            else:
                status = 'matched'
                raw_phone = best_match.get('phone') or best_match.get('phoneUnformatted')
                if raw_phone:
                    phone = validate_singapore_phone(raw_phone)
                if phone:
                    phones_found += 1

        all_results.append({'idx': query['idx'], 'GMaps_Phone': phone, 'GMaps_Status': status})

    # Short pause between runs; skipped after the final batch.
    if batch_num < num_batches:
        time.sleep(2)

# Write every matched, validated phone back into spa_without_phone in place.
phones_updated = 0
for outcome in all_results:
    found_phone = outcome['GMaps_Phone']
    if not found_phone or outcome['GMaps_Status'] != 'matched':
        continue

    row_label = outcome['idx']
    if row_label not in spa_without_phone.index:
        continue

    spa_without_phone.loc[row_label, 'PIC NAME 1 Contact Number'] = found_phone
    spa_without_phone.loc[row_label, 'PIC 1 Source'] = "Google"
    phones_updated += 1

# Final Summary
print(f"\nComplete: {phones_found} phones found, {phones_updated} updated")


Processing 4061 queries in 9 batch(es)...
Batch 1/9...


[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:48.248Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:48.249Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:48.288Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:48.290Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:50.036Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:Q6IpkD3mNwh1qiwq0][0m -> 2026-01-13T04:12:50.600Z [32mINFO[3

Batch 2/9...


[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:57.152Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:57.154Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:57.207Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:57.208Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:58.943Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:KSh2Oo2A7rbd7nG6s][0m -> 2026-01-13T04:17:59.354Z [32mINFO[3

Batch 3/9...


[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:54.248Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:54.249Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:54.316Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:54.317Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:56.050Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:aEBPPB4Myy2OkWBkw][0m -> 2026-01-13T04:21:56.430Z [32mINFO[3

Batch 4/9...


[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:03.477Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:03.478Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:03.610Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:03.611Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:04.999Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:o2J6iRYeTsrTPfi6m][0m -> 2026-01-13T04:26:05.347Z [32mINFO[3

Batch 5/9...


[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:40.338Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:40.340Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:40.453Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:40.455Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:43.085Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:lpWHG0d4JNpbgsG5g][0m -> 2026-01-13T04:30:43.605Z [32mINFO[3

Batch 6/9...


[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:53.606Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:53.608Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:53.647Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:53.648Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:55.539Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:jRVV82Xuksm6AWmzx][0m -> 2026-01-13T04:34:56.050Z [32mINFO[3

Batch 7/9...


[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:03.494Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:03.496Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:03.585Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:03.587Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:05.651Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:3zsaYIUqoCyQjAJ6K][0m -> 2026-01-13T04:39:06.108Z [32mINFO[3

Batch 8/9...


[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:18.892Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:18.894Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:18.978Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:18.978Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:20.954Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:qy47zeGEVjybAh1nf][0m -> 2026-01-13T04:43:21.370Z [32mINFO[3

Batch 9/9...


[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> Status: RUNNING, Message: 
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:38.865Z ACTOR: Pulling container image of build amjNKYcdQsAvuaohs from registry.
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:38.866Z ACTOR: Creating container.
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:38.950Z ACTOR: Starting container.
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:38.951Z ACTOR: Running under "LIMITED_PERMISSIONS" permission level.
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:40.326Z [32mINFO[39m  System info[90m {"apifyVersion":"3.5.1","apifyClientVersion":"2.19.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v18.20.8"}[39m
[36m[apify.crawler-google-places runId:NGwB3Z0ulMkJrbNgs][0m -> 2026-01-13T04:47:40.702Z [32mINFO[3


Complete: 849 phones found, 849 updated


In [62]:
# Show only the two columns written by the Google Maps lookup.
spa_without_phone[["PIC NAME 1 Contact Number", "PIC 1 Source"]]

Unnamed: 0,PIC NAME 1 Contact Number,PIC 1 Source
0,+6596401300,Google
1,,
2,,
3,,
4,,
...,...,...
4056,+6593469776,Google
4057,,
4058,,
4059,+6565323326,Google


In [63]:
contact_numbers = spa_without_phone["PIC NAME 1 Contact Number"]
contact_sources = spa_without_phone["PIC 1 Source"]

# Does either column consist solely of distinct values?
print("PIC NAME 1 Contact Number unique:", contact_numbers.is_unique)
print("PIC 1 Source unique:", contact_sources.is_unique)

# Frequency of every value, missing ones included
print("\nPIC NAME 1 Contact Number value counts:")
print(contact_numbers.value_counts(dropna=False))

print("\nPIC 1 Source value counts:")
print(contact_sources.value_counts(dropna=False))


PIC NAME 1 Contact Number unique: False
PIC 1 Source unique: False

PIC NAME 1 Contact Number value counts:
PIC NAME 1 Contact Number
None           3212
+6592315920       3
+6589039259       3
+6588598922       3
+6587603743       2
               ... 
+6562458696       1
+6586609905       1
+6565131297       1
+6564520755       1
+6566105358       1
Name: count, Length: 802, dtype: int64

PIC 1 Source value counts:
PIC 1 Source
None      3212
Google     849
Name: count, dtype: int64


In [66]:
# Break the contact-number column down into missing, blank and populated rows
contact_numbers = spa_without_phone["PIC NAME 1 Contact Number"]

total = len(spa_without_phone)
na_count = contact_numbers.isna().sum()
empty_count = contact_numbers.eq("").sum()
has_value = total - na_count - empty_count

print(
    f"Total rows: {total}",
    f"NA/Null: {na_count}",
    f"Empty string: {empty_count}",
    f"Has value: {has_value}",
    sep="\n",
)


Total rows: 4061
NA/Null: 3212
Empty string: 0
Has value: 849


In [64]:
# Persist the frame, including the scraped phone columns, without the index.
spa_without_phone.to_parquet(
    "./Staging/Gold/spa_without_phone_formatted_scrapped_google.parquet",
    engine="fastparquet",
    index=False,
)

In [None]:
# Reload the scraped data and split it in two on PIC NAME 1 Contact Number.
spa_without_phone_formatted_scrapped_google = pd.read_parquet(
    "./Staging/Gold/spa_without_phone_formatted_scrapped_google.parquet",
    engine="fastparquet",
)

# A row "has a phone" when the contact number is neither NA/null nor an empty string.
contact_number = spa_without_phone_formatted_scrapped_google["PIC NAME 1 Contact Number"]
has_phone = contact_number.notna() & contact_number.ne("")

# The two frames are complementary: every row lands in exactly one of them.
with_phone = spa_without_phone_formatted_scrapped_google[has_phone].copy()
without_phone = spa_without_phone_formatted_scrapped_google[~has_phone].copy()

print(f"Total rows: {len(spa_without_phone_formatted_scrapped_google)}")
print(f"With phone: {len(with_phone)}")
print(f"Without phone: {len(without_phone)}")

In [65]:
# # Clean GMaps_Results - keep only UEN, ENTITY_NAME, Phone (use GMaps_Phone if available, else Original_Phone)
# gmap_data_clean = GMaps_Results.copy()

# def clean_phone(phone):
#     """Standardize phone to 10 digits with 65 prefix"""
#     if pd.isna(phone) or not phone:
#         return None
#     phone_str = str(phone).strip()
#     # Remove +, spaces, dashes
#     phone_str = phone_str.replace("+", "").replace(" ", "").replace("-", "")
#     # Add 65 prefix if only 8 digits
#     if len(phone_str) == 8:
#         phone_str = "65" + phone_str
#     return phone_str

# # Replace Original_Phone with GMaps_Phone where GMaps_Phone has a value
# gmap_data_clean["Phone"] = gmap_data_clean.apply(
#     lambda row: clean_phone(row["GMaps_Phone"]) if pd.notna(row["GMaps_Phone"]) and row["GMaps_Phone"] else clean_phone(row["Original_Phone"]),
#     axis=1
# )

# # Keep only required columns
# gmap_data_clean = gmap_data_clean[["UEN", "ENTITY_NAME", "Phone"]]

# print(f"Cleaned GMaps data: {len(gmap_data_clean)} rows")
# gmap_data_clean