In [67]:
import requests
import pandas as pd
import numpy as np
import glob
import os
import re
import time
import pandas as pd
from apify_client import ApifyClient
from bs4 import BeautifulSoup
import re
import json
from requests.exceptions import HTTPError, ConnectionError
from urllib3.exceptions import ProtocolError
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI


### Getting Master DB Data

In [68]:

# --- CONFIG ---
# Path to the Master DB Excel workbook; it is read by pd.read_excel() in the LOAD DATA step below.
# NOTE: `file_path` is reassigned later in the notebook (SSIC mapping section),
# so re-run this cell before reloading the Master DB.
file_path = "./Master DB/Master_DB_oct22.xlsx"

# --- HELPER FUNCTIONS ---
def clean_uen(u: str) -> str | None:
    if pd.isna(u):
        return None
    return re.sub(r"[^A-Z0-9]", "", str(u).upper().strip())

def clean_text(text: str) -> str | None:
    if pd.isna(text):
        return None
    text = str(text).strip().upper()
    return None if text == "NAN" else text

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise the column names of ``df``.

    Each label is converted to text, upper-cased and trimmed; every run of
    characters other than A-Z / 0-9 becomes a single underscore; leading and
    trailing underscores are removed.

    Args:
        df: Frame whose columns are renamed. The frame is modified in place.

    Returns:
        The same ``df`` object, with its columns renamed.
    """
    new_cols = []
    for col in df.columns:
        # str() so non-string labels (e.g. an integer header read from Excel)
        # are standardised instead of raising AttributeError on .upper().
        col_std = re.sub(r"[^A-Z0-9]", "_", str(col).upper().strip())
        col_std = re.sub(r"_+", "_", col_std)  # collapse repeated underscores
        new_cols.append(col_std.strip("_"))  # drop leading/trailing underscores
    df.columns = new_cols
    return df

# --- LOAD DATA ---
master_db_df = pd.read_excel(file_path)

# --- SELECT RELEVANT COLUMNS & STANDARDIZE THEIR NAMES ---
columns_to_keep = [
    "Company Registration Number (UEN)",
    "ACRA REGISTERED NAME",
    "Brand/Deal Name/Business Name",
    "Primary SSIC Code",
    "PIC NAME 1 Contact Number",
    "PIC 1 email address",
    "Website URL",
    "Parent Industry Type",
    "Sub Industry",
]
master_db_df = standardize_columns(master_db_df[columns_to_keep].copy())

# --- CLEANING & RENAME SPECIFIC COLUMNS ---
# The UEN column is the first standardized column whose name contains 'UEN'.
uen_candidates = [name for name in master_db_df.columns if "UEN" in name]
uen_col = uen_candidates[0]
master_db_df["UEN"] = master_db_df[uen_col].apply(clean_uen)
master_db_df = master_db_df.drop(columns=[uen_col])

# Rename the remaining columns consistently (rename() skips keys that are absent).
rename_map = {
    "BRAND_DEAL_NAME_BUSINESS_NAME": "BRAND_NAME",
    "PRIMARY_SSIC_CODE": "SSIC_CODE",
    "ACRA_REGISTERED_NAME": "ACRA_REGISTERED_NAME",
}
master_db_df = master_db_df.rename(columns=rename_map)

# Clean the name columns that are present.
for col in ("ACRA_REGISTERED_NAME", "BRAND_NAME"):
    if col not in master_db_df.columns:
        continue
    master_db_df[col] = master_db_df[col].apply(clean_text)

# Store SSIC_CODE as a nullable integer when the column exists.
if "SSIC_CODE" in master_db_df.columns:
    master_db_df["SSIC_CODE"] = master_db_df["SSIC_CODE"].astype("Int64")

# Keep only the required columns that exist.
required_cols = ["UEN", "ACRA_REGISTERED_NAME", "BRAND_NAME", "SSIC_CODE"]
present_required = [c for c in required_cols if c in master_db_df.columns]
master_db_df = master_db_df[present_required]

# Drop rows whose UEN is missing or blank.
uen_not_missing = master_db_df["UEN"].notna()
uen_not_blank = master_db_df["UEN"].str.strip() != ""
master_db_df = master_db_df[uen_not_missing & uen_not_blank]

master_db_df


Unnamed: 0,UEN,ACRA_REGISTERED_NAME,BRAND_NAME,SSIC_CODE
0,04799400B,AIK BEE TEXTILE CO,AIK BEE TEXTILE CO,46411
1,03376200K,SERANGOON GARDEN CLINIC AND DISPENSARY,GARDEN CLINIC,550263
2,06239600E,SALON DE BENZIMEN,SALON DE BENZIMEN,96021
3,06952000C,SU LAN LADIES FASHION,SU LAN LADIES FASHION,14103
4,10381600C,SIN HAI PRINTING SERVICE,SIN HAI PRINTING SERVICE,18113
...,...,...,...,...
7444,201734006N,MISTER MOBILE HOUGANG PTE. LTD.,MISTER MOBILE (HOUGANG),95120
7445,202210879W,MISTER MOBILE CHINATOWN PTE. LTD.,MISTER MOBILE (CHINATOWN),47411
7446,202205507G,MISTER MOBILE PTE. LTD.,MISTER MOBILE HQ,64202
7454,53473046M,BLOONIES,BLOONIES,47742


### Getting ACRA Data (keep only Live / Live Company entities and exclude non-relevant SSIC codes)

In [69]:
# -------------------------------------------------------------
# Folder containing the ACRA CSV extracts
# -------------------------------------------------------------
folder_path = "Acra_Data"

# Collect every CSV inside the folder. glob returns paths in arbitrary order,
# so sort them to keep the combined row order (and any seeded sampling done on
# it later) reproducible between runs and machines.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not csv_files:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in folder '{folder_path}'")

# Read and combine all CSVs
# Using low_memory=False to avoid DtypeWarning for mixed types
df = pd.concat((pd.read_csv(f, low_memory=False) for f in csv_files), ignore_index=True)

# -------------------------------------------------------------
# Convert all column names to uppercase
# -------------------------------------------------------------
df.columns = df.columns.str.upper()


# -------------------------------------------------------------
# Select relevant columns (now in uppercase)
# -------------------------------------------------------------
acra_data = df[[
    "UEN",
    "ENTITY_NAME",
    "BUSINESS_CONSTITUTION_DESCRIPTION",
    "ENTITY_TYPE_DESCRIPTION",
    "ENTITY_STATUS_DESCRIPTION",
    "REGISTRATION_INCORPORATION_DATE",
    "PRIMARY_SSIC_CODE",
    "STREET_NAME",
    "POSTAL_CODE"
]].copy()

# -------------------------------------------------------------
# Convert to proper data types
# -------------------------------------------------------------
acra_data['UEN'] = acra_data['UEN'].astype('string')
acra_data['ENTITY_NAME'] = acra_data['ENTITY_NAME'].astype('string')
acra_data['BUSINESS_CONSTITUTION_DESCRIPTION'] = acra_data['BUSINESS_CONSTITUTION_DESCRIPTION'].astype('string')
acra_data['ENTITY_TYPE_DESCRIPTION'] = acra_data['ENTITY_TYPE_DESCRIPTION'].astype('string')
acra_data['ENTITY_STATUS_DESCRIPTION'] = acra_data['ENTITY_STATUS_DESCRIPTION'].astype('string')
acra_data['REGISTRATION_INCORPORATION_DATE'] = pd.to_datetime(acra_data['REGISTRATION_INCORPORATION_DATE'], errors='coerce')

# -------------------------------------------------------------
# Clean string columns - trim, remove extra spaces, uppercase
# NOTE(review): STREET_NAME and POSTAL_CODE are not cast to 'string' above;
# if a CSV delivers them as numbers the .str accessor will not treat them as
# text - confirm against the source files.
# -------------------------------------------------------------
for col in [
    'UEN',
    'ENTITY_NAME',
    'BUSINESS_CONSTITUTION_DESCRIPTION',
    'ENTITY_TYPE_DESCRIPTION',
    'ENTITY_STATUS_DESCRIPTION',
    'STREET_NAME',
    'POSTAL_CODE'
]:
    acra_data[col] = (
        acra_data[col]
        .fillna('')
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .str.upper()
    )

# -------------------------------------------------------------
# Replace placeholders with NaN for standardization
# (reassigned instead of inplace=True; the result is the same)
# -------------------------------------------------------------
acra_data = acra_data.replace(['NA', 'N/A', '-', ''], np.nan)

# -------------------------------------------------------------
# Convert registration date to dd-mm-yyyy string (optional)
# -------------------------------------------------------------
acra_data['REGISTRATION_INCORPORATION_DATE'] = acra_data['REGISTRATION_INCORPORATION_DATE'].dt.strftime('%d-%m-%Y')

# -------------------------------------------------------------
# Filter only live entities (LIVE COMPANY or LIVE)
# -------------------------------------------------------------
acra_data = acra_data[
    acra_data['ENTITY_STATUS_DESCRIPTION'].isin(['LIVE COMPANY', 'LIVE'])
].reset_index(drop=True)

# -------------------------------------------------------------
# Exclude specific PRIMARY_SSIC_CODE values (the full data set is expected to
# be 600k+ rows; excluding these codes reduces it)
# -------------------------------------------------------------
exclude_codes = [
    46900, 47719, 47749, 47539, 47536, 56123,
    10711, 10712, 10719, 10732, 10733, 93209
]

# Compare numerically: PRIMARY_SSIC_CODE has not been type-converted yet, and
# if any CSV delivered the codes as text an isin() check against the integer
# list above would silently match nothing.
ssic_numeric = pd.to_numeric(acra_data['PRIMARY_SSIC_CODE'], errors='coerce')
acra_data = acra_data[~ssic_numeric.isin(exclude_codes)].reset_index(drop=True)

In [70]:
# Display the cleaned ACRA frame (live entities only, excluded SSIC codes removed).
acra_data

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623
...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034


### Getting SSIC Industry code

In [71]:
# --- CONFIG ---
# NOTE: this reuses the `file_path` name that the Master DB section also assigns.
file_path = "./SSIC_Code/mapped_ssic_code.xlsx"

# --- LOAD DATA ---
mapped_ssic_code = pd.read_excel(file_path)

# --- STANDARDIZE COLUMN NAMES ---
# Uppercase, strip spaces, replace spaces with underscores
mapped_ssic_code.columns = (
    mapped_ssic_code.columns
    .str.strip()
    .str.upper()
    .str.replace(" ", "_")
)

# --- KEEP ONLY DESIRED COLUMNS ---
columns_to_keep = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "SSIC_CODES", "DESCRIPTION"]
mapped_ssic_code = mapped_ssic_code[columns_to_keep].copy()

# --- CLEAN SSIC_CODES COLUMN ---
# Values that cannot be parsed as numbers become the sentinel code 0.
mapped_ssic_code["SSIC_CODES"] = (
    pd.to_numeric(mapped_ssic_code["SSIC_CODES"], errors="coerce")  # safely convert to numeric
    .fillna(0)
    .astype(int)
)

# --- CLEAN TEXT COLUMNS ---
# Trim and title-case the text, but leave missing cells missing: a plain
# astype(str) would turn NaN into the literal string "Nan" after .title().
text_cols = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "DESCRIPTION"]
mapped_ssic_code[text_cols] = mapped_ssic_code[text_cols].apply(
    lambda col: col.where(col.isna(), col.astype(str).str.strip().str.title())
)

# --- REMOVE DUPLICATES & RESET INDEX ---
mapped_ssic_code = mapped_ssic_code.drop_duplicates().reset_index(drop=True)

mapped_ssic_code.head()


Unnamed: 0,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,SSIC_CODES,DESCRIPTION
0,Retail,Retail,Fashion & Apparel,47711,Retail Sale Of Clothing For Adults
1,Retail,Retail,Fashion & Apparel,47712,Retail Sale Of Children And Infants' Clothing
2,Retail,Retail,Fashion & Apparel,47715,Retail Sale Of Sewing And Clothing Accessories
3,Retail,Retail,Fashion & Apparel,47719,"Retail Sale Of Clothing, Footwear And Leather ..."
4,Retail,Retail,Fashion & Apparel,47510,Retail Sale Of Textiles


### Merge ACRA data with SSIC code

In [72]:
# Convert PRIMARY_SSIC_CODE to int (unparseable codes become the sentinel 0)
acra_data["PRIMARY_SSIC_CODE"] = (
    pd.to_numeric(acra_data["PRIMARY_SSIC_CODE"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Build a one-row-per-code lookup from the mapping table:
#  - drop the sentinel code 0 so ACRA rows with an unparseable code are not
#    joined to mapping rows whose code was also unparseable;
#  - keep the first row per code so the left merge cannot multiply ACRA rows.
ssic_lookup = (
    mapped_ssic_code[mapped_ssic_code["SSIC_CODES"] != 0]
    .drop_duplicates(subset="SSIC_CODES", keep="first")
)

# Merge based on SSIC code; validate= raises if the lookup is not unique per code
acra_data_filtered = acra_data.merge(
    ssic_lookup,
    how="left",
    left_on="PRIMARY_SSIC_CODE",
    right_on="SSIC_CODES",
    validate="many_to_one"
)

# Drop the duplicate 'SSIC_CODES' key column (keep only PRIMARY_SSIC_CODE)
acra_data_filtered = acra_data_filtered.drop(columns=["SSIC_CODES"], errors="ignore")


In [73]:
# Display the ACRA frame enriched with the SSIC industry mapping columns.
acra_data_filtered

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Livestock, Meat, Poultry, Eggs An..."
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049,Others,Manufacturing,Other Specialised Manufacturing & Distribution,Other Manufacturing Industries N.E.C.
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623,Retail,Retail,Fashion & Apparel,Retail Sale Of Clothing For Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031,Services,Services,Hair Salons & Barbershops,Hairdressing Salons/Shops
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161,Others,Built Environment & Infrastructure,Construction,Renovation Contractors
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969,Others,"Finance, Legal & Real Estate","Legal, Accounting & Consultancy Activities",Management Consultancy Services N.E.C.
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034,Others,"Tourism, Agency",Travel Agencies & Tour Operators,Travel Agencies And Tour Operators (Mainly Out...


### Filter ACRA data against the Master DB to get the list of companies that haven't been researched by MR

In [74]:

# Normalise the UEN column on both frames (text, trimmed, upper-case) so the
# membership test below compares like with like.
for frame in (acra_data_filtered, master_db_df):
    frame['UEN'] = frame['UEN'].astype(str).str.strip().str.upper()

# Keep only ACRA rows whose UEN does not already appear in the Master DB.
already_in_master = acra_data_filtered['UEN'].isin(master_db_df['UEN'])
acra_data_filtered = acra_data_filtered[~already_in_master]

acra_data_filtered.shape

(533824, 13)

### Filter by Industry (Tuition / training services / child care)

In [75]:
# SSIC codes selected for this industry filter (compared as strings below)
ssic_codes = [
    "85332", "8536", "85360", "85403", "85404",
    "855", "8550", "85501", "85502", "85503", "85504",
    "85505", "85506", "85507", "85508", "85509",
    "856", "8560", "85601", "85602", "85609",
    "87022", "8891", "88911", "88912", "88991", "96094",
]

# Rows whose status is "live" or "live company" (case-insensitive)
status_lower = acra_data_filtered["ENTITY_STATUS_DESCRIPTION"].str.lower()
is_live = (status_lower == "live") | (status_lower == "live company")

# Rows whose primary SSIC code, rendered as text, is one of the selected codes
in_selected_industry = acra_data_filtered["PRIMARY_SSIC_CODE"].astype(str).isin(ssic_codes)

acra_data_filtered_by_industry = acra_data_filtered[is_live & in_selected_industry]

acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
859,198900817R,ALPHABET PLAYHOUSE CHILD CARE AND LEARNING CEN...,,LOCAL COMPANY,LIVE COMPANY,02-03-1989,88911,SAM LEONG ROAD,207922,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
983,199004029M,ACEWORLD HOLDINGS PTE LTD,,LOCAL COMPANY,LIVE COMPANY,17-08-1990,88911,SIAN TUAN AVENUE,588270,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
1040,199102118N,AVERBEL CHILD DEVELOPMENT CENTRE PTE LTD,,LOCAL COMPANY,LIVE COMPANY,09-05-1991,88911,YISHUN AVENUE 5,760742,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
1786,199609311Z,ALBERTON MANAGEMENT INSTITUTE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,28-12-1996,85501,SIN MING LANE,573969,Others,Educational,Industry-Specific Vocational & Professional Tr...,"Training Courses For Construction, Real Estate..."
2090,199901279H,AAYTOZEE @ HILLVIEW PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,17-03-1999,88911,JALAN DERMAWAN,668947,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537047,53497327B,ZYNTELLECT ADVISORY,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-01-2025,85509,CHESTNUT AVENUE,679524,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
537084,53501816M,ZAVIER TUITION SERVICES,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-03-2025,85509,UPPER THOMSON ROAD,574364,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
537208,T10LL0717A,ZEUS COMMUNICATIONS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,27-04-2010,96094,UPPER THOMSON ROAD,574329,Services,Services,Other Personal Service Activities,Training Of Pets
537226,T13LL0369C,ZEUS TALK LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,03-03-2013,96094,UPPER THOMSON ROAD,574329,Services,Services,Other Personal Service Activities,Training Of Pets


### Filter with Fresh Leads

In [76]:
# # --- Copy to avoid SettingWithCopyWarning ---
# acra_data_filtered_wholesale = acra_data_filtered_wholesale.copy()

# # --- UPDATE HERE: Remove rows if UEN exists in recordowl_results.xlsx ---
# recordowl_results = pd.read_excel("Fresh_Leads.xlsx")
# # Ensure both dataframes have a 'UEN' column
# if "UEN" in recordowl_results.columns and "UEN" in acra_data_filtered_wholesale.columns:
#     filtered = acra_data_filtered_wholesale[~acra_data_filtered_wholesale["UEN"].isin(recordowl_results["UEN"])]
# else:
#     raise ValueError("Column 'UEN' not found in one of the dataframes.")

# # sample data 
# acra_data_filtered_wholesale = filtered.sample(n=50, random_state=42).reset_index(drop=True)

# acra_data_filtered_wholesale.head()


In [77]:

# acra_data_filtered_wholesale = pd.DataFrame({
#     "UEN": ["201625008K"]
# })

In [80]:
# Pilot sample: draw up to 10 companies with a fixed seed so the draw is repeatable.
# min() guards against ValueError from sample() when fewer than 10 rows remain.
pilot_sample_size = min(10, len(acra_data_filtered_by_industry))
acra_data_filtered_pilot = (
    acra_data_filtered_by_industry
    .sample(n=pilot_sample_size, random_state=42)
    .reset_index(drop=True)
)

acra_data_filtered_pilot.head(10)


Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,202527176G,T & S ASPIRE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,24-06-2025,88912,LAVENDER STREET,338729,Others,Hospital,Social Services (Without Accommodations),Student Care Services; Child Minding Services ...
1,53462945L,REVISION BUDDY EDUTECH,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,14-02-2023,85509,NEW BRIDGE ROAD,59413,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
2,53414477B,KUTTIES CHUTTIES,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,05-06-2020,85509,TAMPINES AVENUE 9,524601,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
3,202409709K,HO KAANG INTERNATIONAL EDUCATION & CONSULTANCY...,,LOCAL COMPANY,LIVE COMPANY,12-03-2024,85509,TEMASEK BOULEVARD,38987,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
4,53502930J,AVERIE PLAYHOUSE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,22-04-2025,88911,YISHUN AVENUE 11,760417,Others,Hospital,Social Services (Without Accommodations),Infant Care Services; Child Minding Services F...
5,202210514D,KEYPATH EDUCATION SINGAPORE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,28-03-2022,85509,STRAITS VIEW,18937,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
6,53459592L,CREATIVE CAMPUS (EAST),SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,01-12-2022,85509,MARINE PARADE ROAD,449269,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
7,202016352N,LEARN WITH US PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,11-06-2020,85509,ANG MO KIO INDUSTRIAL PARK 2A,567760,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.
8,201418890W,LERUS ASIA PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,30-06-2014,88991,UBI CRESCENT,408568,Others,Hospital,Social Services (Without Accommodations),Job Training And Vocational Rehabilitation Ser...
9,201109789R,'X-FACTOR'! QUOTIENT PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,25-04-2011,85509,UPPER THOMSON ROAD,574424,Others,Educational,Tuition & Enrichment Centers,Training Courses N.E.C.


### Get Data from RecordOwl 

In [None]:

# Read the Apify API token from the environment instead of hardcoding it in the
# notebook. SECURITY: the token that was previously committed in this cell
# should be treated as leaked and revoked in the Apify console.
apify_api_token = os.environ.get("APIFY_API_TOKEN")
if not apify_api_token:
    raise RuntimeError(
        "APIFY_API_TOKEN environment variable is not set; "
        "export it before running this cell."
    )
client = ApifyClient(apify_api_token)

# Domains of social-media sites, one entry per line.
SOCIAL_MEDIA_DOMAINS = [
    "facebook.com",
    "linkedin.com",
    "instagram.com",
    "youtube.com",
    "tiktok.com",
    "twitter.com",
    "x.com",
    "pinterest.com",
]

def _list_all_dataset_items(dataset_client, page_size=100):
    """Collect dataset items by paging through ``list_items`` until a short or empty page."""
    collected = []
    offset = 0
    while True:
        page = dataset_client.list_items(offset=offset, limit=page_size, clean=True)
        if not page.items:
            break
        collected.extend(page.items)
        if len(page.items) < page_size:
            break
        offset += page_size
    return collected


def fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=3):
    """Fetch all items of a dataset, trying two strategies per attempt.

    Each attempt first streams the items with ``iterate_items()`` and, if that
    raises or yields nothing, immediately falls back to paginated
    ``list_items()`` calls. (Previously a streaming failure skipped the
    fallback on every attempt except the last one.) Every strategy starts from
    a fresh list, so a partially fetched page set from a failed attempt can
    never leak into a later result.

    Args:
        dataset_client: Object exposing ``iterate_items()`` and
            ``list_items(offset=, limit=, clean=)`` whose result has an
            ``items`` attribute.
        max_retries: Maximum number of attempts.
        initial_wait: Base delay in seconds; after an attempt in which a
            strategy raised, the wait is ``initial_wait * 2 ** attempt``.
            Attempts in which both strategies ran cleanly but returned nothing
            are retried without waiting, as before.

    Returns:
        The list of items from the first strategy that returns a non-empty
        result, or an empty list when every attempt came back empty or failed.
        Exceptions raised by the dataset client are logged, not propagated.
    """
    for attempt in range(max_retries):
        last_error = None

        # Strategy 1: streaming iteration
        try:
            items = list(dataset_client.iterate_items())
            if items:
                return items
        except Exception as e:
            last_error = e
            print(f"  ‚ö†Ô∏è Iteration method failed (attempt {attempt + 1}/{max_retries}), trying direct fetch...")

        # Strategy 2: direct pagination
        try:
            items = _list_all_dataset_items(dataset_client)
            if items:
                return items
        except Exception as e:
            last_error = e
            print(f"  ‚ö†Ô∏è Direct fetch failed (attempt {attempt + 1}/{max_retries}): {e}")

        if last_error is None:
            # Both strategies worked but the dataset is (still) empty.
            continue

        if attempt < max_retries - 1:
            wait_time = initial_wait * (2 ** attempt)  # Exponential backoff
            print(f"  ‚è≥ Retrying dataset fetch in {wait_time}s...")
            time.sleep(wait_time)
        else:
            print(f"  ‚ùå All fetch methods failed: {last_error}")

    return []

def run_apify_with_retry(client, run_input, uen, max_retries=3):
    """Run the "apify/puppeteer-scraper" actor, retrying when the run looks blocked.

    Each attempt calls the actor with ``run_input``, waits for the run to
    finish and, when the status is "SUCCEEDED", checks that the run's default
    dataset contains at least one item. An empty dataset, or a failed run /
    raised exception whose text contains "403" or "blocked", triggers a retry
    after ``30 * 2 ** attempt`` seconds while attempts remain.

    Args:
        client: Apify client object; ``actor()``, ``run()`` and ``dataset()`` are used.
        run_input: Input dict passed to the actor call.
        uen: Identifier used only in the progress message.
        max_retries: Maximum number of attempts.

    Returns:
        A ``(run, error)`` tuple. ``(run, None)`` when the run is accepted,
        ``(None, message)`` when the dataset stayed empty on the last attempt,
        when the call raised, or when no attempt was made.

    NOTE(review): a run whose status is not "SUCCEEDED" is still returned as
    ``(run, None)`` when its info does not mention 403/blocked, or when it does
    but this was the last attempt - confirm that callers expect this.
    """
    for attempt in range(max_retries):
        try:
            print(f"  üì° Starting Apify run for {uen} (attempt {attempt + 1}/{max_retries})...")
            run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

            print(f"  ‚è≥ Waiting for run to complete...")
            # If `run` is None, the subscript below raises and is handled by the
            # outer except as "Apify call failed".
            run_client = client.run(run["id"])
            run_info = run_client.wait_for_finish()

            # Check whether the run actually produced data, not just whether it "succeeded"
            if run_info and "status" in run_info:
                status = run_info.get("status")

                # Even if status is "SUCCEEDED", verify dataset actually has items
                if status == "SUCCEEDED" and "defaultDatasetId" in run:
                    # Quick check if dataset has any items
                    try:
                        dataset_check = client.dataset(run["defaultDatasetId"])
                        time.sleep(2)  # Brief wait for dataset to be ready
                        test_items = dataset_check.list_items(limit=1, clean=True)

                        if test_items.items and len(test_items.items) > 0:
                            # Dataset has items - true success!
                            print(f"  ‚úÖ Run succeeded with data")
                            return run, None
                        else:
                            # Status says "SUCCEEDED" but dataset is EMPTY - this is a failure!
                            print(f"  ‚ö†Ô∏è Run completed but dataset is empty (likely 403 block)")
                            # Treat as 403 and retry
                            if attempt < max_retries - 1:
                                wait_time = 30 * (2 ** attempt)
                                print(f"  üîÑ Retrying in {wait_time}s...")
                                time.sleep(wait_time)
                                continue
                            else:
                                return None, "Dataset empty after all retries (403 blocking)"
                    except Exception as e:
                        print(f"  ‚ö†Ô∏è Could not verify dataset: {e}")
                        # If we can't check dataset, try to use the run anyway
                        return run, None

                elif status != "SUCCEEDED":
                    # Look for signs of blocking in the textual form of the run info
                    error_msg = str(run_info)
                    if "403" in error_msg or "blocked" in error_msg.lower():
                        if attempt < max_retries - 1:
                            wait_time = 30 * (2 ** attempt)  # 30s, 60s, 120s
                            print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                            time.sleep(wait_time)
                            continue

            # Reached when no branch above returned or retried; the run is handed
            # back without an error message (see NOTE(review) in the docstring).
            return run, None

        except Exception as e:
            error_str = str(e)
            # Retry only exceptions that look like blocking, and only while attempts remain
            if "403" in error_str or "blocked" in error_str.lower():
                if attempt < max_retries - 1:
                    wait_time = 30 * (2 ** attempt)
                    print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                    continue
            return None, f"Apify call failed: {str(e)}"

    # Every loop iteration above ends in a return or a continue with attempts
    # remaining, so this line is reached only when the loop body never ran
    # (max_retries <= 0).
    return None, "Max retries exceeded due to 403 blocking"

all_results = []

for idx, (i, row) in enumerate(acra_data_filtered_pilot.iterrows(), 1):
    uen = str(row["UEN"]).strip()
    print(f"\nüîé Processing {uen} ({idx}/{len(acra_data_filtered_pilot)})")

    # Build pageFunction with proper escaping
    page_function = f"""
    async function pageFunction(context) {{
        const {{ page, log, request }} = context;
        const uen = "{uen}";
        log.info("Visiting RecordOwl for UEN: " + uen);

        try {{
            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ timeout: 30000 }});
            const input = await page.$("input[placeholder='Search company name, industry, or address']");
            await input.click({{ clickCount: 3 }});
            await input.type(uen, {{ delay: 100 }});

            await Promise.all([
                page.waitForNavigation({{ waitUntil: 'networkidle2', timeout: 60000 }}).catch(() => null),
                page.click("button[type='submit']")
            ]);

            // Wait for results with longer timeout
            try {{
                await page.waitForSelector("a[href*='/company/']", {{ timeout: 45000 }});
            }} catch (e) {{
                log.info("No company links found, might be not found");
                return {{ status: 'not_found', uen }};
            }}

            const companyLink = await page.$$eval("a[href*='/company/']", (links, uen) => {{
                for (const a of links) {{
                    const text = a.innerText || "";
                    const href = a.href || "";
                    if (text.includes(uen) || href.includes(uen.toLowerCase())) return a.href;
                }}
                return links.length > 0 ? links[0].href : null;
            }}, uen);

            if (!companyLink) return {{ status: 'not_found', uen }};

            if (page.url() !== companyLink) {{
                await page.goto(companyLink, {{ waitUntil: 'networkidle2', timeout: 60000 }});
            }}

            // Wait for critical content to load - phone numbers are often in dt/dd tags
            await Promise.race([
                page.waitForSelector('dt', {{ timeout: 10000 }}).catch(() => null),
                new Promise(r => setTimeout(r, 8000)) // Increased from 3s to 8s
            ]);
            
            // Additional wait to ensure all dynamic content loads
            await new Promise(r => setTimeout(r, 5000));
            
            const html_content = await page.content();
            const title = await page.title();
            const url = page.url();

            return {{ status: 'success', uen, url, title, html_content }};
        }} catch (err) {{
            log.error("Error in pageFunction: " + err.message);
            return {{ status: 'error', uen, error: err.message }};
        }}
    }}
    """

    run_input = {
        "startUrls": [{"url": "https://recordowl.com/"}],
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "pageFunction": page_function,
        "ignoreSslErrors": False,
        "ignoreCorsAndCsp": False,
        "maxRequestRetries": 3,  # Increased retry attempts
        "maxRequestsPerCrawl": 1,  # One page per run
        "maxConcurrency": 1,  # No parallel requests
        "pageLoadTimeoutSecs": 90,  # Optimized timeout
        "pageFunctionTimeoutSecs": 180,  # 3 minutes for pageFunction
        "waitUntil": ["networkidle2"],  # Wait for network to be idle
        # OPTIMIZED: Residential proxies with recommended rotation
        "proxyConfiguration": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],  # Residential IPs less likely to be blocked
        },
        "proxyRotation": "RECOMMENDED",  # Optimal proxy rotation strategy
    }

    # Use retry logic for 403 errors (5 attempts = more chances to recover)
    run, error = run_apify_with_retry(client, run_input, uen, max_retries=5)

    if error or not run:
        print(f"  ‚ùå Apify call failed for {uen}: {error}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": error or "No run returned"
        })
        time.sleep(10)  # Longer sleep after failure
        continue

    if not run or "defaultDatasetId" not in run:
        print(f"  ‚ö†Ô∏è No valid dataset returned for {uen}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": "No dataset returned"
        })
        continue

    # Wait for dataset to be ready with progressive checking
    print(f"  ‚è≥ Waiting for dataset to be ready...")
    time.sleep(5)  # Initial wait
    
    # Try to fetch dataset with progressive waits
    dataset_client = client.dataset(run["defaultDatasetId"])
    for check_attempt in range(3):
        try:
            # Quick check if dataset has items
            test_fetch = dataset_client.list_items(limit=1, clean=True)
            if test_fetch.items:
                break
        except:
            pass
        
        if check_attempt < 2:
            additional_wait = 3 * (check_attempt + 1)
            print(f"  ‚è≥ Dataset not ready, waiting {additional_wait}s more...")
            time.sleep(additional_wait)
    
    scraped_html, record_owl_url = None, None
    
    # Fetch dataset items with improved error handling
    dataset_items = fetch_dataset_items_safe(
        dataset_client,
        max_retries=5,
        initial_wait=5  # Increased from 3 to 5
    )
    
    # Process items
    if not dataset_items:
        print(f"  ‚ö†Ô∏è Dataset is empty - no items returned!")
    else:
        print(f"  üìä Dataset has {len(dataset_items)} item(s)")
    
    for item in dataset_items:
        if item.get("status") == "success":
            scraped_html = item.get("html_content", "")
            record_owl_url = item.get("url")
            if scraped_html:
                print(f"  ‚úÖ Successfully scraped {uen} ({len(scraped_html)} chars of HTML)")
            else:
                print(f"  ‚ö†Ô∏è Status is 'success' but html_content is empty for {uen}")
        elif item.get("status") == "not_found":
            print(f"  ‚ö†Ô∏è Company not found for UEN {uen}")
        elif item.get("status") == "error":
            print(f"  ‚ùå Error for {uen}: {item.get('error')}")
        else:
            print(f"  ‚ö†Ô∏è Unknown item status for {uen}: {item.get('status')}")
            print(f"  üìã Item keys: {list(item.keys())}")

    if not scraped_html:
        # Determine the specific reason for failure
        if not dataset_items:
            error_reason = "Dataset empty (likely 403 block at Apify level)"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "not_found" for item in dataset_items):
            error_reason = "Company not found on RecordOwl"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "error" for item in dataset_items):
            error_details = [item.get("error", "Unknown") for item in dataset_items if item.get("status") == "error"]
            error_reason = f"Scraping error: {error_details[0] if error_details else 'Unknown'}"
            print(f"  ‚ùå {error_reason}")
        else:
            error_reason = "No HTML content retrieved (unknown reason)"
            print(f"  ‚ö†Ô∏è {error_reason}")
            # Debug: show what's in dataset items
            if dataset_items:
                print(f"  üîç DEBUG - First item: {dataset_items[0]}")
        
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": error_reason
        })
        time.sleep(5)
        continue

    # Parse HTML
    try:
        soup = BeautifulSoup(scraped_html, "html.parser")
        parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")

        emails, phones, website = [], [], None
        facebook_links, linkedin_links, instagram_links, tiktok_links = [], [], [], []

        if parent:
            # Extract emails
            for a in parent.select("a[href^=mailto]"):
                email = a.get("href", "").replace("mailto:", "").strip()
                if email and email not in emails and "@" in email:
                    emails.append(email)

            # ========== COMPREHENSIVE PHONE EXTRACTION ==========
            # This extracts Singapore phone numbers with ANY spacing/formatting:
            # - "65 63 19 2960" (spaces between digits)
            # - "6563192960" (no spaces)
            # - "+65-6319-2960" (dashes)
            # - "65 6 3 1 9 2 9 6 0" (space between every digit)
            # - "(65) 6319 2960" (with parentheses)
            # Method: Extract ALL digits first, then validate pattern
            print(f"  üîç Searching for phone numbers...")
            
            # Method 1: Look for tel: links (most reliable)
            tel_links = parent.select("a[href^='tel:'], a[href^='tel']")
            if tel_links:
                print(f"  üì± Found {len(tel_links)} tel: links")
            
            for a in tel_links:
                tel_href = a.get("href", "").replace("tel:", "").strip()
                tel_text = a.get_text(strip=True)
                print(f"  üìû Tel link - href: '{tel_href}', text: '{tel_text}'")
                
                # Extract all digits from tel link
                digits_only = re.sub(r"\D", "", tel_href)
                print(f"  üî¢ Tel digits: {digits_only}")
                
                # Handle different digit lengths
                if len(digits_only) == 10 and digits_only.startswith("65") and digits_only[2] in "689":
                    # 10 digits starting with 65 (e.g., "6563192960")
                    formatted = "+" + digits_only
                    if formatted not in phones:
                        phones.append(formatted)
                        print(f"  ‚úÖ Added from tel link (10 digits): {formatted}")
                elif len(digits_only) == 8 and digits_only[0] in "689":
                    # 8 digits starting with 6/8/9 (e.g., "63192960")
                    formatted = "+65" + digits_only
                    if formatted not in phones:
                        phones.append(formatted)
                        print(f"  ‚úÖ Added from tel link (8 digits): {formatted}")
                elif len(digits_only) > 10:
                    # More than 10 digits, try to find valid pattern
                    print(f"  üîç Searching within {len(digits_only)} digits for valid pattern...")
                    found = False
                    # Look for 65 followed by 6/8/9
                    for i in range(len(digits_only) - 9):
                        if digits_only[i:i+2] == "65" and digits_only[i+2] in "689":
                            formatted = "+" + digits_only[i:i+10]
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from tel link (extracted): {formatted}")
                            found = True
                            break
                    if not found:
                        # Try last 8 digits if they start with 6/8/9
                        if digits_only[-8] in "689":
                            formatted = "+65" + digits_only[-8:]
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from tel link (last 8 digits): {formatted}")
            
            # Method 2: Look in dt/dd structure with broader keywords
            dt_tags = parent.select("dt")
            if dt_tags:
                print(f"  üìã Found {len(dt_tags)} dt tags")
            
            for dt in dt_tags:
                dt_text = dt.get_text(strip=True).lower()
                # Check for phone-related keywords but exclude non-phone fields
                exclude_keywords = ["officer", "charge", "employee", "shareholder", "director", "registration"]
                phone_keywords = ["contact number", "phone", "tel", "mobile", "call", "contact no"]
                
                is_phone_field = any(kw in dt_text for kw in phone_keywords)
                is_excluded = any(excl in dt_text for excl in exclude_keywords)
                
                if is_phone_field and not is_excluded:
                    dd = dt.find_next_sibling("dd")
                    if dd:
                        number_text = dd.get_text(" ", strip=True)
                        print(f"  üìù Field '{dt_text}': {number_text}")
                        
                        # Extract all digits and check if it forms a valid phone number
                        all_digits = re.sub(r"\D", "", number_text)
                        print(f"  üî¢ Extracted digits: {all_digits}")
                        
                        # Check for Singapore phone patterns in the digits
                        # Pattern 1: 10 digits starting with 65
                        if len(all_digits) == 10 and all_digits.startswith("65") and all_digits[2] in "689":
                            formatted = "+" + all_digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from dt/dd (10 digits): {formatted}")
                        # Pattern 2: 8 digits starting with 6, 8, or 9
                        elif len(all_digits) == 8 and all_digits[0] in "689":
                            formatted = "+65" + all_digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from dt/dd (8 digits): {formatted}")
                        # Pattern 3: More than 10 digits, try to extract 10-digit number starting with 65
                        elif len(all_digits) > 10:
                            # Look for 65 followed by 6/8/9 in the digit string
                            for i in range(len(all_digits) - 9):
                                if all_digits[i:i+2] == "65" and all_digits[i+2] in "689":
                                    potential_number = all_digits[i:i+10]
                                    formatted = "+" + potential_number
                                    if formatted not in phones:
                                        phones.append(formatted)
                                        print(f"  ‚úÖ Added from dt/dd (extracted): {formatted}")
                                    break
            
            # Method 3: Search entire parent for phone patterns if none found
            if not phones:
                print(f"  üîé No phones found yet, searching entire content...")
                full_text = parent.get_text()
                
                # Ultra-comprehensive patterns to catch ALL spacing variations
                # These patterns allow unlimited spaces/dashes between digits
                patterns = [
                    # Pattern 1: +65 with any spacing (e.g., "+65 6 3 1 9 2 9 6 0", "+65-6319-2960")
                    r"\+[\s\-]*65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 2: (65) with any spacing
                    r"\([\s\-]*65[\s\-]*\)[\s\-]*[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 3: 65 without + or () but with space/dash (e.g., "65 6 3 1 9 2 9 6 0", "65-6319-2960")
                    r"(?<!\d)65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                    # Pattern 4: Just 8 digits starting with 6/8/9 with any spacing
                    r"(?<!\d)[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                ]
                
                for pattern_idx, pattern in enumerate(patterns, 1):
                    matches = re.findall(pattern, full_text)
                    if matches:
                        print(f"  üîç Pattern {pattern_idx} found {len(matches)} potential matches")
                    
                    for match in matches:
                        # Extract only digits
                        digits = re.sub(r"\D", "", match)
                        print(f"  üî¢ Pattern {pattern_idx} match: '{match.strip()}' ‚Üí digits: '{digits}'")
                        
                        # Validate and format
                        if len(digits) == 10 and digits.startswith("65") and digits[2] in "689":
                            formatted = "+" + digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from pattern {pattern_idx} (10 digits): {formatted}")
                        elif len(digits) == 8 and digits[0] in "689":
                            formatted = "+65" + digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from pattern {pattern_idx} (8 digits): {formatted}")
                        elif len(digits) > 10:
                            # Try to find a valid 10-digit number within
                            for i in range(len(digits) - 9):
                                if digits[i:i+2] == "65" and digits[i+2] in "689":
                                    potential = digits[i:i+10]
                                    formatted = "+" + potential
                                    if formatted not in phones:
                                        phones.append(formatted)
                                        print(f"  ‚úÖ Added from pattern {pattern_idx} (extracted): {formatted}")
                                    break
            
            if phones:
                print(f"  ‚úÖ Total phones found: {phones}")
            else:
                print(f"  ‚ö†Ô∏è WARNING: No phone numbers found for {uen}")
                print(f"  üìÑ Showing first 500 chars of parent HTML for debugging:")
                print(parent.prettify()[:500] + "...")
            # ========== END PHONE EXTRACTION ==========

            # Extract website
            valid_websites = []
            for a in parent.select("a[href^=http]"):
                href = a.get("href", "").strip()
                href_lower = href.lower()
                if not any(domain in href_lower for domain in SOCIAL_MEDIA_DOMAINS):
                    if not any(skip in href_lower for skip in ["recordowl", "apify.com"]):
                        if any(tld in href for tld in [".com", ".sg", ".net", ".org", ".co"]):
                            valid_websites.append(href)
            website = valid_websites[0] if valid_websites else None

        # Extract social media links from entire page
        for a in soup.find_all("a", href=True):
            href = a["href"].strip().lower()
            if "facebook.com" in href and href not in facebook_links:
                facebook_links.append(href)
            elif "linkedin.com" in href and href not in linkedin_links:
                linkedin_links.append(href)
            elif "instagram.com" in href and href not in instagram_links:
                instagram_links.append(href)
            elif "tiktok.com" in href and href not in tiktok_links:
                tiktok_links.append(href)

        all_results.append({
            "UEN": uen,
            "Emails": emails if emails else None,
            "Phones": phones if phones else None,
            "Website": website,
            "Facebook": list(set(facebook_links)) if facebook_links else None,
            "LinkedIn": list(set(linkedin_links)) if linkedin_links else None,
            "Instagram": list(set(instagram_links)) if instagram_links else None,
            "TikTok": list(set(tiktok_links)) if tiktok_links else None,
            "RecordOwl_Link": record_owl_url,
        })
        print(f"  ‚úÖ Processed {uen}: {len(emails) if emails else 0} emails, {len(phones) if phones else 0} phones")
        
    except Exception as e:
        print(f"  ‚ùå Error parsing HTML for {uen}: {e}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": f"HTML parsing error: {str(e)}"
        })

    # Dynamic sleep time to avoid rate limiting and 403 blocks
    # Longer delays reduce detection and blocking
    base_sleep = 20  # Increased from 10
    random_addition = (idx % 10) + 5  # 5-14 seconds random
    sleep_time = base_sleep + random_addition  # 25-34 seconds total

    print(f"  üí§ Sleeping for {sleep_time}s before next request...")
    time.sleep(sleep_time)

    # Extra delay after every 5th request to further avoid detection
    if idx % 5 == 0:
        extra_wait = 30
        print(f"  üõë Checkpoint pause: waiting extra {extra_wait}s...")
        time.sleep(extra_wait)

# Collect the per-UEN result dicts gathered by the scraping loop into one table.
New_Fresh_Leads = pd.DataFrame(all_results)
print("\n‚úÖ Scraping complete!")
print(f"\nüìä Results summary:")
print(f"   Total processed: {len(New_Fresh_Leads)}")
# An empty `all_results` produces a frame without any columns; report 0 for a
# missing column instead of failing with KeyError at the very end of a long run.
for summary_label, summary_col in (("emails", "Emails"), ("phones", "Phones"), ("websites", "Website")):
    if summary_col in New_Fresh_Leads.columns:
        found_count = New_Fresh_Leads[summary_col].notna().sum()
    else:
        found_count = 0
    print(f"   With {summary_label}: {found_count}")
# Last expression of the cell: rich preview of the first ten rows.
New_Fresh_Leads.head(10)


üîé Processing 202527176G (1/10)
  üì° Starting Apify run for 202527176G (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> 2025-11-06T05:18:55.773Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> 2025-11-06T05:18:55.782Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> 2025-11-06T05:18:55.813Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> 2025-11-06T05:18:56.000Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:jiDkDRdgLfbqQP0Ky][0m -> 2025-11-06T05:18:57.373Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:jiD

  ‚ùå Apify call failed for 202527176G: Apify call failed: Expecting value: line 1 column 1 (char 0)

üîé Processing 53462945L (2/10)
  üì° Starting Apify run for 53462945L (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> 2025-11-06T05:20:09.307Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> 2025-11-06T05:20:09.312Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> 2025-11-06T05:20:11.835Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> 2025-11-06T05:20:12.362Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:hVxDYcusXZYqzY0Qb][0m -> 2025-11-06T05:20:13.282Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:hVx

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 53462945L (1473254 chars of HTML)
  üîç Searching for phone numbers...
  üìã Found 19 dt tags
  üîé No phones found yet, searching entire content...
  üìÑ Showing first 500 chars of parent HTML for debugging:
<div class="max-w-7xl mx-auto lg:py-6 sm:px-6 lg:px-8" style="height: auto !important;">
 <div class="flex flex-col lg:flex-row" style="height: auto !important;">
  <div class="w-full lg:w-2/3 lg:pr-8" style="height: auto !important;">
   <div class="lg:mb-4 border-b border-gray-200">
    <ul class="flex flex-nowrap overflow-x-auto -mb-px text-sm font-medium text-center scrollbar-hide" id="companyTabs" role="tablist">
     <li class="mr-2" role="presentation">
      <button aria-controls="overvi...
  ‚úÖ Processed 53462945L: 0 emails, 0 phones
  üí§ Sleeping for 27s before next request...

üîé Processing 53414477B

[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> 2025-11-06T05:21:22.901Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> 2025-11-06T05:21:22.903Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> 2025-11-06T05:21:23.034Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> 2025-11-06T05:21:23.405Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:fLsA6DNaZ5RUe5wLQ][0m -> 2025-11-06T05:21:24.212Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:fLs

In [None]:
# Bare expression: renders the scraped-leads DataFrame as this cell's output.
New_Fresh_Leads

### Append and save into Excel sheet

In [None]:

# # Load both Excel files
# file_path_1 = "Fresh_Leads.xlsx"
# Fresh_Leads = pd.read_excel(file_path_1)

# # file_path_2 = "recordowl_results_4.xlsx"
# # recordowl_results_4 = pd.read_excel(file_path_2)

# # Append (combine) them
# combined_df = pd.concat([Fresh_Leads, Fresh_Leads_with_phones], ignore_index=True)

# # Optional: Save to a new Excel file
# combined_df.to_excel("Fresh_Leads_New.xlsx", index=False)

# # Preview
# combined_df


In [None]:
# count_non_nan = combined_df['Phones'].notna().sum()
# print(count_non_nan)


### Website Scraping

In [None]:
import httpx
import asyncio

# =====================================================
# Validate Website (only if no phone number)
# =====================================================
async def check_url(url: str) -> bool:
    """Return True if the URL is reachable (final status < 400).

    A cheap HEAD request is tried first. Many servers answer HEAD with
    403/405/501 although the page is served normally for GET, so a failing
    HEAD is re-checked with a streamed GET (headers only, body not downloaded)
    before the URL is reported as unreachable.

    Args:
        url: Absolute URL to probe. ``None``, NaN, other non-string values and
            blank strings are treated as unreachable.

    Returns:
        True when HEAD or the fallback GET ends with a status below 400 after
        following redirects; False on any error or timeout.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    url = url.strip()
    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url)
            if response.status_code < 400:
                return True
            # HEAD was refused; confirm with GET but only read the status line/headers.
            async with client.stream("GET", url) as get_response:
                return get_response.status_code < 400
    except Exception:
        # Best-effort probe: DNS failures, timeouts and malformed URLs all mean "not reachable".
        return False


def _has_value(value) -> bool:
    """Return True when a cell holds real data (not None/NaN/NA and not an empty string or container)."""
    if value is None or value is pd.NA:
        return False
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return False
    try:
        return bool(value)
    except ValueError:
        # Array-like cells have no single truth value; a non-empty one counts as present.
        return len(value) > 0


async def validate_if_needed(df):
    """Validate websites only if phone number is missing.

    Adds (or overwrites) the "Website_Valid" column on ``df`` in place:

    * ``None``      - the row already has a phone number, so no check was made
    * ``"valid"``   - no phone, and ``check_url`` reported the website reachable
    * ``"invalid"`` - no phone, and the website is missing or unreachable

    A missing phone is ``None``, NaN or an empty list/string; NaN matters
    because frames reloaded from Excel hold NaN rather than ``None``.

    Args:
        df: DataFrame with "Website" and "Phones" columns.

    Returns:
        The same ``df`` object, with "Website_Valid" filled for every row.
    """
    statuses = []
    for _, row in df.iterrows():
        # Skip validation if phone exists
        if _has_value(row.get("Phones")):
            statuses.append(None)
            continue

        # Validate website if no phone
        url = row.get("Website")
        if _has_value(url) and await check_url(url):
            statuses.append("valid")
        else:
            statuses.append("invalid")

    # Assign the whole column at once with object dtype so that mixing None and
    # strings never depends on the dtype inferred from the first written cell.
    df["Website_Valid"] = pd.Series(statuses, index=df.index, dtype=object)
    return df


# =====================================================
# Run async validation safely inside Jupyter
# =====================================================
# Top-level `await` is used directly in the cell instead of asyncio.run().
# validate_if_needed writes "Website_Valid" onto the frame it is given and
# returns that same frame, so `result_df` is both input and output here.
# NOTE(review): `result_df` must already exist from an earlier cell - confirm
# which frame it is expected to be.
result_df = await validate_if_needed(result_df)

# =====================================================
# Final output
# =====================================================
# Rich display of the frame including the new "Website_Valid" column.
display(result_df)


### If the contact number is missing, scrape the company website to get a contact number

In [None]:
import asyncio
import os
import time
from apify_client import ApifyClient

# --- Initialize Apify client ---
# The token is a secret: read it from the environment only. A literal fallback
# in the notebook would be committed and shared together with the file.
APIFY_TOKEN = os.getenv("APIFY_TOKEN")
if not APIFY_TOKEN:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set; export it before running this cell."
    )
client = ApifyClient(APIFY_TOKEN)

# --- Async wrapper so you can run in Jupyter ---
async def enrich_with_contact_info(df):
    """Scrape contact info for rows where Website_Valid == 'valid' and Phones is empty.

    Each qualifying website is crawled with the ``apify/puppeteer-scraper``
    actor. The landing page is searched for the first http(s) link whose href
    contains "contact"; that page is enqueued and the visible ``mailto:`` and
    ``tel:`` links on it are collected.

    Args:
        df: DataFrame with "Website", "Website_Valid" and "Phones" columns.
            It is not modified.

    Returns:
        A copy of ``df``. For rows where the crawl found something, "Emails"
        and "Phones" are replaced by the non-empty scraped lists (existing
        values are kept when nothing was scraped for that field) and
        "Contact_Page" holds the URL the data came from.
    """
    updated_df = df.copy()

    def _is_missing(value):
        """True for None, NaN/NA and empty strings or containers."""
        if value is None or value is pd.NA:
            return True
        if isinstance(value, float) and value != value:  # NaN check without extra imports
            return True
        try:
            return not bool(value)
        except ValueError:
            return len(value) == 0

    def _store(row_label, column, value):
        """Write one cell, making sure the column exists and can hold lists."""
        if column not in updated_df.columns:
            updated_df[column] = None
        if updated_df[column].dtype != object:
            updated_df[column] = updated_df[column].astype(object)
        updated_df.at[row_label, column] = value

    for i, row in df.iterrows():
        website = row.get("Website")
        status = row.get("Website_Valid")
        phone = row.get("Phones")

        # A NaN phone (e.g. after an Excel round-trip) counts as missing, not as present.
        if _is_missing(website) or status != "valid" or not _is_missing(phone):
            continue  # Skip invalid or already complete rows

        print(f"üîç Scraping contact page for: {website}")

        # --- CONVERTED TO PUPPETEER-SCRAPER (same as Cell 20) ---
        # Now using native Puppeteer syntax instead of jQuery
        run_input = {
            "startUrls": [{"url": website}],
            "pageFunction": r"""
                async function pageFunction(context) {
                    const { page, log, request } = context;
                    const isContact = request.userData?.isContact || false;

                    // If not on contact page yet, try to find and navigate to it
                    if (!isContact) {
                        try {
                            // Wait for page to load
                            await page.waitForSelector('a', { timeout: 10000 }).catch(() => null);
                            
                            // Find contact page link using Puppeteer
                            const contactUrl = await page.evaluate(() => {
                                const links = Array.from(document.querySelectorAll('a[href]'));
                                for (const link of links) {
                                    const href = link.getAttribute('href');
                                    if (!href || !href.toLowerCase().includes('contact')) {
                                        continue;
                                    }
                                    try {
                                        // Resolve relative hrefs against the page and keep only
                                        // real pages (skips mailto:contact@..., tel:, javascript:)
                                        const resolved = new URL(href, document.baseURI);
                                        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
                                            return resolved.href;
                                        }
                                    } catch (e) {
                                        // Malformed href - try the next link
                                    }
                                }
                                return null;
                            });

                            if (contactUrl) {
                                await context.enqueueRequest({ 
                                    url: contactUrl, 
                                    userData: { isContact: true } 
                                });
                                log.info(`Enqueued contact page: ${contactUrl}`);
                            }
                            return null;
                        } catch (err) {
                            log.error(`Error finding contact page: ${err.message}`);
                            return null;
                        }
                    }

                    // We're on the contact page - extract emails and phones
                    try {
                        // Wait for content to load
                        await new Promise(r => setTimeout(r, 3000));

                        // Extract emails and phones using Puppeteer
                        const contactData = await page.evaluate(() => {
                            // Helper: check if element is visible
                            function isVisible(el) {
                                return el && el.offsetParent !== null;
                            }

                            // Extract emails from mailto links (drop any ?subject=... suffix)
                            const emailLinks = Array.from(document.querySelectorAll('a[href^="mailto"]'));
                            const emails = emailLinks
                                .filter(el => isVisible(el))
                                .map(el => el.getAttribute('href').replace(/^mailto:/i, '').split('?')[0].trim())
                                .filter(email => email.length > 0);

                            // Extract phones from tel links
                            const phoneLinks = Array.from(document.querySelectorAll('a[href^="tel"]'));
                            const phones = phoneLinks
                                .filter(el => isVisible(el))
                                .map(el => el.getAttribute('href').replace(/[^0-9]/g, ''))
                                .filter(phone => phone.length > 0);

                            return {
                                emails: [...new Set(emails)],
                                phones: [...new Set(phones)]
                            };
                        });

                        return {
                            contactUrl: request.url,
                            emails: contactData.emails.length ? contactData.emails : [],
                            phones: contactData.phones.length ? contactData.phones : []
                        };
                    } catch (err) {
                        log.error(`Error extracting contact data: ${err.message}`);
                        return {
                            contactUrl: request.url,
                            emails: [],
                            phones: [],
                            error: err.message
                        };
                    }
                }
            """,
            "useChrome": True,
            "headless": True,
            "stealth": True,
            "ignoreSslErrors": False,
            "ignoreCorsAndCsp": False,
            "maxRequestRetries": 3,  # Increased retry attempts
            "maxRequestsPerCrawl": 0,  # No limit (will crawl main + contact pages)
            "maxConcurrency": 1,  # No parallel requests
            "pageLoadTimeoutSecs": 90,  # Optimized timeout
            "pageFunctionTimeoutSecs": 180,  # 3 minutes for pageFunction
            "waitUntil": ["networkidle2"],  # Wait for network to be idle
            # OPTIMIZED: Residential proxies with recommended rotation
            "proxyConfiguration": {
                "useApifyProxy": True,
                "apifyProxyGroups": ["RESIDENTIAL"],  # Residential IPs less likely to be blocked
            },
            "proxyRotation": "RECOMMENDED",  # Optimal proxy rotation strategy
        }

        # --- Run the Apify scraper (NOW USING PUPPETEER-SCRAPER) ---
        try:
            print(f"  üì° Starting Apify puppeteer-scraper...")
            run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
            
            # Wait for dataset to be ready (yield to the event loop instead of blocking it)
            await asyncio.sleep(3)
            
            dataset = client.dataset(run["defaultDatasetId"])
            results = list(dataset.iterate_items())
            contact_results = [r for r in results if r and (r.get("emails") or r.get("phones"))]

            if contact_results:
                scraped = contact_results[0]
                # Only overwrite a field when the crawl actually found values for it,
                # so an empty list never wipes data gathered by an earlier step.
                scraped_emails = scraped.get("emails") or None
                scraped_phones = scraped.get("phones") or None
                if scraped_emails:
                    _store(i, "Emails", scraped_emails)
                if scraped_phones:
                    _store(i, "Phones", scraped_phones)
                _store(i, "Contact_Page", scraped.get("contactUrl", None))
                print(f"  ‚úÖ Found: {scraped.get('phones', [])} / {scraped.get('emails', [])}")
            else:
                print("  ‚ö†Ô∏è No contact data found.")

        except Exception as e:
            # Best effort per website: report the failure and move on to the next row.
            print(f"  ‚ùå Error scraping {website}: {e}")
        
        # Add delay to avoid rate limiting
        await asyncio.sleep(5)

    return updated_df


# --- Run the scraper for valid websites ---
# Rebinds `result_df` to the enriched copy returned by the coroutine.
# NOTE(review): the coroutine reads the "Website_Valid" column, so this cell
# presumably has to run after the website-validation cell - confirm cell order.
result_df = await enrich_with_contact_info(result_df)

# --- Display updated results ---
display(result_df)


### Facebook Scraping

In [None]:
# Initialize the ApifyClient with an API token taken from the environment.
# A literal token must not live in the notebook: it leaks through commits and
# through every shared copy of the file.
_facebook_apify_token = os.getenv("APIFY_TOKEN")
if not _facebook_apify_token:
    raise RuntimeError(
        "APIFY_TOKEN environment variable is not set; export it before running this cell."
    )
client = ApifyClient(_facebook_apify_token)

# Singapore phone validation (country code is mandatory)
def validate_singapore_number(phone):
    """Return ``phone`` unchanged if it is a Singapore number with country code, else None.

    Whitespace, dashes and parentheses are ignored for the check. What remains
    must be an optional "+", then "65", then eight digits of which the first
    is 6, 8 or 9. Falsy input (None, "", 0) yields None.
    """
    if not phone:
        return None

    # Compare on a separator-free copy; the caller gets the original formatting back.
    compact = re.sub(r'[\s\-\(\)]', '', str(phone))
    has_sg_shape = re.match(r'^\+?65[689]\d{7}$', compact) is not None
    return phone if has_sg_shape else None

# Actor input: the Facebook page(s) to scrape and the language to request.
run_input = {
    "pages": ["https://www.facebook.com/KPECTHub/"],
    "language": "en-US",
}

# Run the Actor and wait for it to finish
run = client.actor("oJ48ceKNY7ueGPGL0").call(run_input=run_input)

# Flatten every dataset item into one record per Facebook page.
results = []
page_items = client.dataset(run["defaultDatasetId"]).iterate_items()
for item in page_items:
    # Prefer the page's phone field and fall back to its WhatsApp number;
    # anything that is not a Singapore number WITH country code becomes None.
    phone = validate_singapore_number(item.get('phone') or item.get('wa_number'))

    # First link that is not a Google Maps URL, otherwise the first link, otherwise None.
    website = None
    links = item.get('websites', [])
    if links:
        non_map_links = [link for link in links if 'maps.google.com' not in link]
        website = (non_map_links or links)[0]

    record = dict(
        facebook_url=item.get('facebookUrl'),
        page_name=item.get('pageName'),
        phone=phone,
        email=item.get('email'),
        website=website,
        address=item.get('address'),
    )
    results.append(record)

# One row per scraped page.
df = pd.DataFrame(results)

df