In [1]:
import requests
import pandas as pd
import numpy as np
import glob
import os
import re
import time
import pandas as pd
from apify_client import ApifyClient
from bs4 import BeautifulSoup
import re
import json
from requests.exceptions import HTTPError, ConnectionError
from urllib3.exceptions import ProtocolError
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI


### Getting Master DB Data

In [2]:

# --- CONFIG ---
# Path to the Master DB Excel workbook that is loaded below with pd.read_excel.
# NOTE(review): the name `file_path` is reassigned later in the notebook for the
# SSIC mapping workbook, so running cells out of order can load the wrong file.
file_path = "./Master DB/Master_DB_oct22.xlsx"

# --- HELPER FUNCTIONS ---
def clean_uen(u: str) -> str | None:
    if pd.isna(u):
        return None
    return re.sub(r"[^A-Z0-9]", "", str(u).upper().strip())

def clean_text(text: str) -> str | None:
    if pd.isna(text):
        return None
    text = str(text).strip().upper()
    return None if text == "NAN" else text

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename every column to UPPER_SNAKE_CASE, in place.

    Each label is upper-cased and stripped, every character outside
    ``A-Z`` / ``0-9`` becomes an underscore, runs of underscores collapse to
    one, and leading/trailing underscores are removed.

    Args:
        df: Frame whose ``columns`` attribute is overwritten.

    Returns:
        The same ``df`` object, to allow ``df = standardize_columns(df)``.
    """
    def _normalise(label) -> str:
        # str() first: Excel headers can be ints or datetimes, which have no
        # .upper() and previously crashed this function.
        name = re.sub(r"[^A-Z0-9]", "_", str(label).upper().strip())
        name = re.sub(r"_+", "_", name)  # Replace multiple underscores with single
        return name.strip("_")  # Remove leading/trailing underscores

    df.columns = [_normalise(col) for col in df.columns]
    return df

# --- LOAD DATA ---
master_db_df = pd.read_excel(file_path)

# --- SELECT RELEVANT COLUMNS ---
columns_to_keep = [
    "Company Registration Number (UEN)",
    "ACRA REGISTERED NAME",
    "Brand/Deal Name/Business Name",
    "Primary SSIC Code",
    "PIC NAME 1 Contact Number",
    "PIC 1 email address",
    "Website URL",
    "Parent Industry Type",
    "Sub Industry"
]
master_db_df = master_db_df[columns_to_keep].copy()

# --- STANDARDIZE COLUMN NAMES ---
master_db_df = standardize_columns(master_db_df)

# --- CLEANING & RENAME SPECIFIC COLUMNS ---
# Dynamically find the UEN column (first column containing 'UEN'); fail with a
# readable message instead of a bare IndexError when none exists.
uen_candidates = [c for c in master_db_df.columns if "UEN" in c]
if not uen_candidates:
    raise KeyError("No column containing 'UEN' found in the Master DB after standardising column names.")
uen_col = uen_candidates[0]
master_db_df["UEN"] = master_db_df[uen_col].apply(clean_uen)
if uen_col != "UEN":
    # Only drop the source column when it is not the cleaned column itself.
    master_db_df = master_db_df.drop(columns=[uen_col])

# Rename other columns consistently
rename_map = {
    "BRAND_DEAL_NAME_BUSINESS_NAME": "BRAND_NAME",
    "PRIMARY_SSIC_CODE": "SSIC_CODE",
    "ACRA_REGISTERED_NAME": "ACRA_REGISTERED_NAME"
}
master_db_df = master_db_df.rename(columns={k: v for k, v in rename_map.items() if k in master_db_df.columns})

# Clean text columns
for col in ["ACRA_REGISTERED_NAME", "BRAND_NAME"]:
    if col in master_db_df.columns:
        master_db_df[col] = master_db_df[col].apply(clean_text)

# Convert SSIC_CODE to nullable integer if it exists. Coerce first so stray
# text cells become <NA> instead of aborting the cell (same policy as the
# errors="coerce" conversions used for the other SSIC columns in this notebook).
if "SSIC_CODE" in master_db_df.columns:
    master_db_df["SSIC_CODE"] = pd.to_numeric(master_db_df["SSIC_CODE"], errors="coerce").astype("Int64")

# Keep only required columns if they exist
required_cols = ["UEN", "ACRA_REGISTERED_NAME", "BRAND_NAME", "SSIC_CODE"]
master_db_df = master_db_df[[c for c in required_cols if c in master_db_df.columns]]

# Filter out rows with missing or empty UEN. The original row index is kept;
# .copy() makes the result an independent frame so later column assignments
# do not raise SettingWithCopyWarning.
has_uen = master_db_df["UEN"].notna() & (master_db_df["UEN"].str.strip() != "")
master_db_df = master_db_df[has_uen].copy()

master_db_df


Unnamed: 0,UEN,ACRA_REGISTERED_NAME,BRAND_NAME,SSIC_CODE
0,04799400B,AIK BEE TEXTILE CO,AIK BEE TEXTILE CO,46411
1,03376200K,SERANGOON GARDEN CLINIC AND DISPENSARY,GARDEN CLINIC,550263
2,06239600E,SALON DE BENZIMEN,SALON DE BENZIMEN,96021
3,06952000C,SU LAN LADIES FASHION,SU LAN LADIES FASHION,14103
4,10381600C,SIN HAI PRINTING SERVICE,SIN HAI PRINTING SERVICE,18113
...,...,...,...,...
7444,201734006N,MISTER MOBILE HOUGANG PTE. LTD.,MISTER MOBILE (HOUGANG),95120
7445,202210879W,MISTER MOBILE CHINATOWN PTE. LTD.,MISTER MOBILE (CHINATOWN),47411
7446,202205507G,MISTER MOBILE PTE. LTD.,MISTER MOBILE HQ,64202
7454,53473046M,BLOONIES,BLOONIES,47742


### Getting ACRA Data (keep only Live / Live Company entities and exclude non-relevant SSIC codes)

In [3]:
# -------------------------------------------------------------
# Folder containing your CSVs
# -------------------------------------------------------------
folder_path = "Acra_Data"

# Get all CSV file paths inside the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
if not csv_files:
    # pd.concat on an empty iterable fails with an opaque "No objects to
    # concatenate"; report the real cause instead.
    raise FileNotFoundError(f"No CSV files found in folder '{folder_path}'")

# Read and combine all CSVs
# Using low_memory=False to avoid DtypeWarning for mixed types
df = pd.concat((pd.read_csv(f, low_memory=False) for f in csv_files), ignore_index=True)

# -------------------------------------------------------------
# Convert all column names to uppercase
# -------------------------------------------------------------
df.columns = df.columns.str.upper()


# -------------------------------------------------------------
# Select relevant columns (now in uppercase)
# -------------------------------------------------------------
acra_data = df[[
    "UEN",
    "ENTITY_NAME",
    "BUSINESS_CONSTITUTION_DESCRIPTION",
    "ENTITY_TYPE_DESCRIPTION",
    "ENTITY_STATUS_DESCRIPTION",
    "REGISTRATION_INCORPORATION_DATE",
    "PRIMARY_SSIC_CODE",
    "STREET_NAME",
    "POSTAL_CODE"
]].copy()

# -------------------------------------------------------------
# Convert to proper data types
# -------------------------------------------------------------
for col in [
    'UEN',
    'ENTITY_NAME',
    'BUSINESS_CONSTITUTION_DESCRIPTION',
    'ENTITY_TYPE_DESCRIPTION',
    'ENTITY_STATUS_DESCRIPTION'
]:
    acra_data[col] = acra_data[col].astype('string')
acra_data['REGISTRATION_INCORPORATION_DATE'] = pd.to_datetime(acra_data['REGISTRATION_INCORPORATION_DATE'], errors='coerce')

# -------------------------------------------------------------
# Clean string columns - trim, remove extra spaces, uppercase
# -------------------------------------------------------------
for col in [
    'UEN',
    'ENTITY_NAME',
    'BUSINESS_CONSTITUTION_DESCRIPTION',
    'ENTITY_TYPE_DESCRIPTION',
    'ENTITY_STATUS_DESCRIPTION',
    'STREET_NAME',
    'POSTAL_CODE'
]:
    acra_data[col] = (
        acra_data[col]
        .fillna('')
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .str.upper()
    )

# -------------------------------------------------------------
# Replace placeholders with NaN for standardization
# -------------------------------------------------------------
acra_data = acra_data.replace(['NA', 'N/A', '-', ''], np.nan)

# -------------------------------------------------------------
# Convert registration date to dd-mm-yyyy string (optional)
# -------------------------------------------------------------
acra_data['REGISTRATION_INCORPORATION_DATE'] = acra_data['REGISTRATION_INCORPORATION_DATE'].dt.strftime('%d-%m-%Y')

# -------------------------------------------------------------
# Filter only live entities (LIVE COMPANY or LIVE)
# -------------------------------------------------------------
acra_data = acra_data[
    acra_data['ENTITY_STATUS_DESCRIPTION'].isin(['LIVE COMPANY', 'LIVE'])
].reset_index(drop=True)

# -------------------------------------------------------------
# Exclude specific PRIMARY_SSIC_CODE values (supposedly the data would be 600k plus but when we exclude this would lessen)
# -------------------------------------------------------------
exclude_codes = [
    46900, 47719, 47749, 47539, 47536, 56123,
    10711, 10712, 10719, 10732, 10733, 93209
]

# Compare numerically: PRIMARY_SSIC_CODE has not been converted yet, so when
# the CSVs yield text values a direct isin() against integers would silently
# match nothing. The column itself is left untouched here.
ssic_numeric = pd.to_numeric(acra_data['PRIMARY_SSIC_CODE'], errors='coerce')
acra_data = acra_data[~ssic_numeric.isin(exclude_codes)].reset_index(drop=True)

In [4]:
# Display the cleaned ACRA frame (the rendered output is truncated to the first and last rows).
acra_data

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623
...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034


### Getting SSIC Industry code

In [5]:
# --- CONFIG ---
file_path = "./SSIC_Code/mapped_ssic_code.xlsx"

# --- LOAD DATA ---
mapped_ssic_code = pd.read_excel(file_path)

# --- STANDARDIZE COLUMN NAMES ---
# Uppercase, strip spaces, replace spaces with underscores
mapped_ssic_code.columns = (
    mapped_ssic_code.columns
    .str.strip()
    .str.upper()
    .str.replace(" ", "_")
)

# --- KEEP ONLY DESIRED COLUMNS ---
columns_to_keep = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "SSIC_CODES", "DESCRIPTION"]
mapped_ssic_code = mapped_ssic_code[columns_to_keep].copy()

# --- CLEAN SSIC_CODES COLUMN ---
# Rows whose code cannot be parsed are dropped rather than filled with 0:
# a sentinel 0 would later join to every ACRA row whose own code was also
# coerced to 0 and attach a bogus industry to it.
ssic_numeric = pd.to_numeric(mapped_ssic_code["SSIC_CODES"], errors="coerce")  # safely convert to numeric
mapped_ssic_code = (
    mapped_ssic_code.loc[ssic_numeric.notna()]
    .assign(SSIC_CODES=ssic_numeric.dropna().astype(int))
)

# --- CLEAN TEXT COLUMNS ---
# Only non-missing cells are rewritten; astype(str) on the whole column would
# otherwise turn NaN into the literal text "Nan".
text_cols = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "DESCRIPTION"]
mapped_ssic_code[text_cols] = mapped_ssic_code[text_cols].apply(
    lambda col: col.where(col.isna(), col.astype(str).str.strip().str.title())
)

# --- REMOVE DUPLICATES & RESET INDEX ---
mapped_ssic_code = mapped_ssic_code.drop_duplicates().reset_index(drop=True)

mapped_ssic_code.head()


Unnamed: 0,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,SSIC_CODES,DESCRIPTION
0,Retail,Retail,Fashion & Apparel,47711,Retail Sale Of Clothing For Adults
1,Retail,Retail,Fashion & Apparel,47712,Retail Sale Of Children And Infants' Clothing
2,Retail,Retail,Fashion & Apparel,47715,Retail Sale Of Sewing And Clothing Accessories
3,Retail,Retail,Fashion & Apparel,47719,"Retail Sale Of Clothing, Footwear And Leather ..."
4,Retail,Retail,Fashion & Apparel,47510,Retail Sale Of Textiles


### Merge ACRA data with SSIC code

In [6]:
# Coerce the ACRA SSIC code to a plain integer column (unparseable values become 0)
ssic_as_number = pd.to_numeric(acra_data["PRIMARY_SSIC_CODE"], errors="coerce")
acra_data["PRIMARY_SSIC_CODE"] = ssic_as_number.fillna(0).astype(int)

# Left-join the industry mapping onto every ACRA row, then discard the
# mapping's own key column so that only PRIMARY_SSIC_CODE remains
acra_data_filtered = pd.merge(
    acra_data,
    mapped_ssic_code,
    how="left",
    left_on="PRIMARY_SSIC_CODE",
    right_on="SSIC_CODES",
).drop(columns=["SSIC_CODES"], errors="ignore")


In [7]:
# Display the ACRA rows with the industry columns merged in from the SSIC mapping.
acra_data_filtered

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Livestock, Meat, Poultry, Eggs An..."
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049,Others,Manufacturing,Other Specialised Manufacturing & Distribution,Other Manufacturing Industries N.E.C.
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623,Retail,Retail,Fashion & Apparel,Retail Sale Of Clothing For Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031,Services,Services,Hair Salons & Barbershops,Hairdressing Salons/Shops
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161,Others,Built Environment & Infrastructure,Construction,Renovation Contractors
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969,Others,"Finance, Legal & Real Estate","Legal, Accounting & Consultancy Activities",Management Consultancy Services N.E.C.
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034,Others,"Tourism, Agency",Travel Agencies & Tour Operators,Travel Agencies And Tour Operators (Mainly Out...


### Filter ACRA data against the Master DB to get the list of companies that haven't been researched by MR

In [8]:

# Ensure both UEN columns are strings for accurate matching
acra_uen = acra_data_filtered['UEN'].astype(str).str.strip().str.upper()
master_uen = master_db_df['UEN'].astype(str).str.strip().str.upper()

# Store the normalised values with .assign() rather than column assignment:
# master_db_df is the result of a boolean filter, and writing a column into
# such a frame raises SettingWithCopyWarning.
master_db_df = master_db_df.assign(UEN=master_uen)

# Filter out rows in acra_data_filtered whose UEN is already in master_db_df.
# .copy() leaves an independent frame for the cells that follow.
acra_data_filtered = acra_data_filtered.assign(UEN=acra_uen)
acra_data_filtered = acra_data_filtered[~acra_data_filtered['UEN'].isin(master_uen)].copy()

acra_data_filtered.shape

(533824, 13)

### Filter by Industry

In [9]:
# wholesale data: SSIC codes (as strings) that identify wholesale trade
ssic_codes = [
    "46", "461", "4610", "46100", "462", "4621", "46211", "46212", "46213", "46219",
    "4622", "46221", "46222", "46223", "46224", "46225", "46229", "463", "4630", "46301",
    "46302", "46303", "46304", "46305", "46306", "46307", "46308", "46309", "464", "4641",
    "46411", "46412", "46413", "46414", "46415", "46416", "4642", "46421", "46422", "46423",
    "46424", "46429", "4643", "46431", "46432", "46433", "46434", "46435", "46436", "46439",
    "4644", "46441", "46442", "46443", "46444", "46445", "46449", "4645", "46451", "46452",
    "46453", "46459", "4646", "46461", "46462", "4647", "46471", "46472", "46473", "46474",
    "46479", "4649", "46491", "46492", "46499", "465", "4651", "46511", "46512", "46513",
    "46514", "4652", "46521", "46522", "46523", "4653", "46530", "4654", "46541", "46542",
    "46543", "46544", "46549", "4655", "46551", "46552", "46559", "4656", "46561", "46562",
    "46563", "4659", "46591", "46592", "46593", "46594", "46595", "46599", "466", "4661",
    "46610", "4662", "46620", "4663", "46631", "46632", "46633", "46634", "46635", "46639",
    "4664", "46641", "46642", "46643", "46649", "4665", "46651", "46659", "4666", "46661",
    "46662", "469", "4690", "46900"
]

# Row is kept when the entity status is live (case-insensitive) ...
status_lower = acra_data_filtered["ENTITY_STATUS_DESCRIPTION"].str.lower()
is_live = status_lower.isin(["live", "live company"])

# ... and its primary SSIC code, compared as text, is one of the wholesale codes
is_wholesale = acra_data_filtered["PRIMARY_SSIC_CODE"].astype(str).isin(ssic_codes)

acra_data_filtered_by_industry = acra_data_filtered[is_live & is_wholesale]


acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Livestock, Meat, Poultry, Eggs An..."
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
12,04129500E,AIK HOE & CO,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,23-01-1975,46551,KELANTAN ROAD,200028,Others,Wholesale Trade,"Machinery, Equipment & Supplies",Wholesale Of Marine Equipment And Accessories
14,04545400X,AIK HUAT AND COMPANY,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,17-01-1975,46441,KAKI BUKIT AVENUE 1,417943,Others,Wholesale Trade,Household Goods,Wholesale Of Sporting Goods And Equipment
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537268,T17LP0162L,ZYA HOLDINGS LIMITED PARTNERSHIP,,LIMITED PARTNERSHIP,LIVE,21-10-2017,46100,NATHAN ROAD,248728,Others,Wholesale Trade,Other Specialised Wholesale,Wholesale On A Fee Or Commission Basis
537298,T22LL0564C,ZEN ENGINEERING & TRADING LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,31-05-2022,46543,TOH GUAN ROAD EAST,608586,Others,Wholesale Trade,"Machinery, Equipment & Supplies","Wholesale Of Lifts, Escalators And Industrial ..."
537302,T23LL0056G,ZECRYNE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,13-01-2023,46301,BUKIT BATOK STREET 25,658881,Others,Wholesale Trade,"Food, Beverages & Tobacco",Wholesale Of Fruits And Vegetables
537313,T24LL0528K,ZOHMH LIMITED LIABILITY PARTNERSHIP,,LIMITED LIABILITY PARTNERSHIP,LIVE,07-05-2024,46303,WOODLANDS AVENUE 4,730844,Others,Wholesale Trade,"Food, Beverages & Tobacco",Wholesale Of A General Line Of Groceries


### Filter with Fresh Leads

In [10]:
# --- Copy to avoid SettingWithCopyWarning ---
acra_data_filtered_wholesale = acra_data_filtered_by_industry.copy()

# --- Remove rows whose UEN already appears in Fresh_Leads_Wholesale.xlsx ---
Fresh_Leads_Wholesale = pd.read_excel("Fresh_Leads_Wholesale.xlsx")


if "UEN" in Fresh_Leads_Wholesale.columns and "UEN" in acra_data_filtered_wholesale.columns:
    # Normalise the spreadsheet UENs the same way the ACRA UENs were normalised
    # (string, trimmed, upper-case) so formatting differences cannot hide a match.
    known_uens = Fresh_Leads_Wholesale["UEN"].dropna().astype(str).str.strip().str.upper()
    filtered = acra_data_filtered_wholesale[~acra_data_filtered_wholesale["UEN"].isin(known_uens)]
else:
    raise ValueError("Column 'UEN' not found in one of the dataframes.")

# sample data
# Cap the sample at the number of rows available: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False.
SAMPLE_SIZE = 50
acra_data_filtered_wholesale = (
    filtered.sample(n=min(SAMPLE_SIZE, len(filtered)), random_state=42)
    .reset_index(drop=True)
)

acra_data_filtered_wholesale.head(10)


Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,53480073D,HUMBLE BREWS,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,26-01-2024,46223,TOH YI DRIVE,590006,Others,Wholesale Trade,Agricultural Raw Materials & Live Animals,"Wholesale Of Coffee, Cocoa And Tea"
1,202303828W,WINE & BUBBLES PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,02-02-2023,46307,STURDEE ROAD,207855,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Liquor, Soft Drinks And Beverages"
2,202542730M,NUVIAA PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,24-09-2025,46413,YISHUN INDUSTRIAL STREET 1,768162,Others,Wholesale Trade,Household Goods,Wholesale Of Children And Infants' Clothing
3,201828332D,DE MAJESTIC VINES PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,17-08-2018,46307,ANSON ROAD,79903,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Liquor, Soft Drinks And Beverages"
4,201813214E,CARDE DESIGN PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,18-04-2018,46431,UPPER CROSS STREET,58357,Others,Wholesale Trade,Household Goods,Wholesale Of Furniture
5,199400661M,AIRPORT EQUIPMENT SERVICES PTE LTD,,LOCAL COMPANY,LIVE COMPANY,28-01-1994,46552,UBI CRESCENT,408564,Others,Wholesale Trade,"Machinery, Equipment & Supplies",Wholesale Of Aircraft Equipment And Supplies
6,201838013D,MITA MEDTECH PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,08-11-2018,46592,ORCHARD BOULEVARD,248649,Others,Wholesale Trade,"Machinery, Equipment & Supplies","Wholesale Of Medical, Professional, Scientific..."
7,53162832M,WAHANA DISTRIBUTOR,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-03-2010,46593,BUKIT BATOK CRESCENT,658065,Others,Wholesale Trade,"Machinery, Equipment & Supplies",Wholesale Of Commercial Food Service Equipment
8,202540737R,ALTIVEX HOLDINGS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,11-09-2025,46634,YISHUN INDUSTRIAL STREET 1,768162,Others,Wholesale Trade,Other Specialised Wholesale,Wholesale Of Paints
9,200301636R,NPRIME INTERNATIONAL PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,25-02-2003,46599,KALLANG AVENUE,339416,Others,Wholesale Trade,"Machinery, Equipment & Supplies",Wholesale Of Other Machinery And Equipment N.E.C.


In [11]:

# Disabled override: uncommenting these lines replaces the sampled frame with a
# single hard-coded UEN (presumably for testing the scraper on one company - confirm).
# acra_data_filtered_wholesale = pd.DataFrame({
#     "UEN": ["201625008K"]
# })

In [12]:
# Independent copy of the sampled leads; this is the frame the RecordOwl loop below iterates over.
Process_data_RecordOwl_df = acra_data_filtered_wholesale.copy()

### Get Data from RecordOwl 

In [None]:

# Read the Apify API token from the environment instead of committing it to the
# notebook. SECURITY: the token that used to be hardcoded on this line has been
# exposed in the notebook and should be revoked and replaced.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
if not APIFY_API_TOKEN:
    raise RuntimeError("Set the APIFY_API_TOKEN environment variable before running this cell.")
client = ApifyClient(APIFY_API_TOKEN)

# Domains treated as social-media sites.
SOCIAL_MEDIA_DOMAINS = [
    "facebook.com", "linkedin.com", "instagram.com", "youtube.com",
    "tiktok.com", "twitter.com", "x.com", "pinterest.com"
]

def fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=3):
    """Fetch all items of a dataset, retrying with exponential backoff.

    Every attempt first streams the dataset with ``iterate_items()``. If that
    call raises and attempts remain, the function sleeps and starts the next
    attempt (it does not fall through to paging, despite the log wording). If
    streaming returns nothing, or raises on the final attempt, the function
    pages through ``list_items()`` 100 items at a time.

    Args:
        dataset_client: Object exposing ``iterate_items()`` and
            ``list_items(offset=..., limit=..., clean=...)`` whose result has
            an ``items`` attribute.
        max_retries: Maximum number of attempts.
        initial_wait: Base delay in seconds; after failed attempt ``k``
            (0-based) the wait is ``initial_wait * 2 ** k``.

    Returns:
        list: The fetched items, or ``[]`` when nothing could be fetched.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        wait_time = initial_wait * (2 ** attempt)  # Exponential backoff
        try:
            # Strategy 1: Try using iterate_items() (streaming)
            try:
                dataset_items = list(dataset_client.iterate_items())
                if dataset_items:
                    return dataset_items
            except Exception:
                if not is_last_attempt:
                    print(f"  ‚ö†Ô∏è Iteration method failed (attempt {attempt + 1}/{max_retries}), trying direct fetch in {wait_time}s...")
                    time.sleep(wait_time)
                    continue
                print(f"  ‚ö†Ô∏è Iteration method failed after all retries, trying direct fetch...")

            # Strategy 2: Try using list_items() (direct pagination)
            try:
                # Start from an empty list on every paging pass. Reusing the
                # accumulator across attempts duplicated items whenever an
                # earlier pass had fetched some pages before failing.
                dataset_items = []
                offset = 0
                limit = 100
                while True:
                    page = dataset_client.list_items(offset=offset, limit=limit, clean=True)
                    if not page.items:
                        break
                    dataset_items.extend(page.items)
                    if len(page.items) < limit:
                        # A short page means the end of the dataset was reached.
                        break
                    offset += limit

                if dataset_items:
                    return dataset_items
            except Exception as e:
                if not is_last_attempt:
                    print(f"  ‚ö†Ô∏è Direct fetch failed (attempt {attempt + 1}/{max_retries}), retrying in {wait_time}s...")
                    time.sleep(wait_time)
                    continue
                print(f"  ‚ùå All fetch methods failed: {e}")
                return []

        except Exception as e:
            # Safety net for errors raised inside the handlers above.
            if not is_last_attempt:
                print(f"  ‚ö†Ô∏è Unexpected error (attempt {attempt + 1}/{max_retries}), retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"  ‚ùå Failed after all retries: {e}")
                return []

    # Every attempt completed without errors but the dataset stayed empty.
    return []

def run_apify_with_retry(client, run_input, uen, max_retries=3):
    """Run the Apify puppeteer scraper, retrying when the run looks blocked.

    A run counts as blocked when (a) it reports ``SUCCEEDED`` but its default
    dataset is empty, (b) it ends in another status and its run info mentions
    "403" or "blocked", or (c) the call raises an exception mentioning "403"
    or "blocked". Blocked runs are retried after 30s, 60s, 120s, ...

    Args:
        client: Apify client exposing ``actor(...)``, ``run(...)`` and
            ``dataset(...)``.
        run_input: Input payload passed to the actor call.
        uen: UEN being processed; used only in log output.
        max_retries: Maximum number of attempts.

    Returns:
        tuple: ``(run, None)`` when the run can be used, otherwise
        ``(None, error_message)``. A run that ended in a non-SUCCEEDED status
        without any sign of blocking is still returned as ``(run, None)``.
    """
    for attempt in range(max_retries):
        has_retries_left = attempt < max_retries - 1
        wait_time = 30 * (2 ** attempt)  # 30s, 60s, 120s
        try:
            print(f"  üì° Starting Apify run for {uen} (attempt {attempt + 1}/{max_retries})...")
            run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

            print(f"  ‚è≥ Waiting for run to complete...")
            run_client = client.run(run["id"])
            run_info = run_client.wait_for_finish()

            # Check if the run actually scraped pages, not just if it "succeeded"
            if run_info and "status" in run_info:
                status = run_info.get("status")

                # Even if status is "SUCCEEDED", verify dataset actually has items
                if status == "SUCCEEDED" and "defaultDatasetId" in run:
                    try:
                        dataset_check = client.dataset(run["defaultDatasetId"])
                        time.sleep(2)  # Brief wait for dataset to be ready
                        test_items = dataset_check.list_items(limit=1, clean=True)
                        has_items = bool(test_items.items)
                    except Exception as e:
                        print(f"  ‚ö†Ô∏è Could not verify dataset: {e}")
                        # If we can't check dataset, try to use the run anyway
                        return run, None

                    if has_items:
                        # Dataset has items - true success!
                        print(f"  ‚úÖ Run succeeded with data")
                        return run, None

                    # Status says "SUCCEEDED" but dataset is EMPTY - treat as a 403 block
                    print(f"  ‚ö†Ô∏è Run completed but dataset is empty (likely 403 block)")
                    if has_retries_left:
                        print(f"  üîÑ Retrying in {wait_time}s...")
                        time.sleep(wait_time)
                        continue
                    return None, "Dataset empty after all retries (403 blocking)"

                elif status != "SUCCEEDED":
                    # Check error message for 403.
                    # NOTE(review): this searches the whole run-info repr, so an
                    # unrelated "403" inside an id or counter also matches.
                    error_msg = str(run_info)
                    if "403" in error_msg or "blocked" in error_msg.lower():
                        if has_retries_left:
                            print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                            time.sleep(wait_time)
                            continue
                        # Final attempt and still blocked: report the failure
                        # instead of handing a blocked run back as a success.
                        return None, "Max retries exceeded due to 403 blocking"

            return run, None

        except Exception as e:
            error_str = str(e)
            is_blocked = "403" in error_str or "blocked" in error_str.lower()
            if is_blocked and has_retries_left:
                print(f"  üö´ Request blocked (403), waiting {wait_time}s before retry...")
                time.sleep(wait_time)
                continue
            return None, f"Apify call failed: {str(e)}"

    return None, "Max retries exceeded due to 403 blocking"

all_results = []

for idx, (i, row) in enumerate(Process_data_RecordOwl_df.iterrows(), 1):
    uen = str(row["UEN"]).strip()
    print(f"\nüîé Processing {uen} ({idx}/{len(Process_data_RecordOwl_df)})")

    # Build pageFunction with proper escaping and improved error handling
    page_function = f"""
    async function pageFunction(context) {{
        const {{ page, log, request }} = context;
        const uen = "{uen}";
        log.info("Visiting RecordOwl for UEN: " + uen);

        try {{
            // Step 1: Wait for search input
            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ timeout: 30000 }});
            log.info("Search input found");
            
            // Step 2: Type UEN into search box with error handling and navigation protection
            try {{
                // Wait for page to be stable (no navigation happening)
                log.info("Waiting for page to stabilize...");
                await new Promise(r => setTimeout(r, 2000)); // Wait for any auto-navigation to complete
                
                // Wait for input to be present and stable
                await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ 
                    timeout: 30000,
                    visible: true 
                }});
                
                // Re-find input right before typing (in case page navigated)
                let input = await page.$("input[placeholder='Search company name, industry, or address']");
                if (!input) {{
                    log.error("Input element not found after wait");
                    return {{ status: 'error', uen, error: 'Input element not found' }};
                }}
                
                // Clear and type with retry logic
                let typed = false;
                for (let attempt = 0; attempt < 3; attempt++) {{
                    try {{
                        // Re-find input on each attempt (in case context was destroyed)
                        input = await page.$("input[placeholder='Search company name, industry, or address']");
                        if (!input) {{
                            throw new Error("Input not found on attempt " + (attempt + 1));
                        }}
                        
                        // Click to focus
                        await input.click({{ clickCount: 3 }});
                        await new Promise(r => setTimeout(r, 300)); // Small delay after click
                        
                        // Clear input first
                        await page.evaluate((selector) => {{
                            const el = document.querySelector(selector);
                            if (el) el.value = '';
                        }}, "input[placeholder='Search company name, industry, or address']");
                        
                        // Type UEN
                        await input.type(uen, {{ delay: 100 }});
                        typed = true;
                        log.info("UEN typed successfully: " + uen);
                        break;
                    }} catch (typeErr) {{
                        if (typeErr.message.includes("Execution context was destroyed") || 
                            typeErr.message.includes("navigation")) {{
                            log.warn("Navigation occurred during typing (attempt " + (attempt + 1) + "/3), retrying...");
                            // Wait for page to stabilize after navigation
                            await new Promise(r => setTimeout(r, 2000));
                            // Re-wait for input
                            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ 
                                timeout: 10000,
                                visible: true 
                            }});
                            continue;
                        }} else {{
                            throw typeErr;
                        }}
                    }}
                }}
                
                if (!typed) {{
                    log.error("Failed to type UEN after all retries");
                    return {{ status: 'error', uen, error: 'Failed to type UEN after retries' }};
                }}
                
            }} catch (typeErr) {{
                log.error("Error typing UEN: " + typeErr.message);
                return {{ status: 'error', uen, error: 'Failed to type UEN: ' + typeErr.message }};
            }}

            // Step 3: Submit search with flexible waiting strategy
            try {{
                log.info("Clicking submit button...");
                
                // Click submit button first
                await page.click("button[type='submit']");
                log.info("Submit button clicked");
                
                // Wait for either navigation OR results to appear (more flexible)
                // Strategy: Wait for results to appear, with navigation as optional
                try {{
                    // Option 1: Wait for navigation (if it happens) - non-blocking
                    const navigationPromise = page.waitForNavigation({{ 
                        waitUntil: 'networkidle2', 
                        timeout: 30000 
                    }}).catch(() => {{
                        log.info("Navigation did not occur (may be client-side routing)");
                        return null;
                    }});
                    
                    // Option 2: Wait for results to appear (more reliable)
                    const resultsPromise = page.waitForSelector("a[href*='/company/']", {{ 
                        timeout: 60000 
                    }});
                    
                    // Wait for either navigation or results (whichever happens first)
                    await Promise.race([
                        navigationPromise,
                        resultsPromise
                    ]);
                    
                    // Give page time to stabilize
                    await new Promise(r => setTimeout(r, 2000));
                    log.info("Page stabilized after submit");
                    
                }} catch (waitErr) {{
                    // If both navigation and results wait failed, try one more time for results
                    log.warn("Initial wait failed, trying again for results: " + waitErr.message);
                    try {{
                        await page.waitForSelector("a[href*='/company/']", {{ timeout: 30000 }});
                        log.info("Results found on retry");
                    }} catch (retryErr) {{
                        log.info("No company links found after submit, might be not found");
                        return {{ status: 'not_found', uen }};
                    }}
                }}
                
            }} catch (navErr) {{
                log.error("Error during submit: " + navErr.message);
                // Don't fail immediately - try to check if results are already there
                try {{
                    const hasResults = await page.$("a[href*='/company/']");
                    if (hasResults) {{
                        log.info("Results found despite submit error");
                    }} else {{
                        return {{ status: 'error', uen, error: 'Submit failed: ' + navErr.message }};
                    }}
                }} catch (checkErr) {{
                    return {{ status: 'error', uen, error: 'Submit failed: ' + navErr.message }};
                }}
            }}

            // Step 4: Verify search results are present
            log.info("Verifying company links are present...");
            try {{
                // Double-check that results are actually there
                await page.waitForSelector("a[href*='/company/']", {{ timeout: 10000 }});
                log.info("Company links confirmed");
            }} catch (e) {{
                log.info("No company links found, might be not found");
                return {{ status: 'not_found', uen }};
            }}

            // Step 5: Find the correct company link (in a new execution context after navigation)
            let companyLink;
            try {{
                companyLink = await page.evaluate((searchUen) => {{
                    const links = Array.from(document.querySelectorAll("a[href*='/company/']"));
                    for (const a of links) {{
                        const text = a.innerText || "";
                        const href = a.href || "";
                        if (text.includes(searchUen) || href.includes(searchUen.toLowerCase())) {{
                            return a.href;
                        }}
                    }}
                    return links.length > 0 ? links[0].href : null;
                }}, uen);
                
                if (!companyLink) {{
                    log.info("No matching company link found");
                    return {{ status: 'not_found', uen }};
                }}
                log.info("Found company link: " + companyLink);
            }} catch (evalErr) {{
                log.error("Error finding company link: " + evalErr.message);
                return {{ status: 'error', uen, error: 'Failed to find company link: ' + evalErr.message }};
            }}

            // Step 6: Navigate to company page if not already there
            if (page.url() !== companyLink) {{
                try {{
                    log.info("Navigating to company page...");
                    await page.goto(companyLink, {{ 
                        waitUntil: 'networkidle2', 
                        timeout: 60000 
                    }});
                    log.info("Company page loaded");
                    
                    // Critical: Wait for page to fully stabilize
                    await new Promise(r => setTimeout(r, 5000));
                }} catch (gotoErr) {{
                    log.error("Error navigating to company page: " + gotoErr.message);
                    return {{ status: 'error', uen, error: 'Failed to load company page: ' + gotoErr.message }};
                }}
            }}

            // Step 7: Wait for content to load (with multiple fallback strategies)
            log.info("Waiting for page content...");
            try {{
                await Promise.race([
                    page.waitForSelector('dt', {{ timeout: 15000 }}),
                    page.waitForSelector('dl', {{ timeout: 15000 }}),
                    page.waitForSelector('.max-w-7xl', {{ timeout: 15000 }}),
                    new Promise(r => setTimeout(r, 10000)) // Fallback: just wait 10s
                ]);
                log.info("Content loaded");
            }} catch (contentErr) {{
                log.warn("Content wait timeout, but continuing: " + contentErr.message);
            }}
            
            // Additional stabilization wait
            await new Promise(r => setTimeout(r, 3000));
            
            // Step 8: Extract content (in stable context) - ONLY VISIBLE ELEMENTS
            let html_content, title, url;
            try {{
                // Get only the visible HTML content by removing hidden elements
                await page.evaluate(() => {{
                    // Remove all elements that are hidden from view
                    const allElements = document.querySelectorAll('*');
                    allElements.forEach(el => {{
                        const style = window.getComputedStyle(el);
                        // Mark hidden elements with a special attribute
                        if (style.display === 'none' || 
                            style.visibility === 'hidden' || 
                            style.opacity === '0' ||
                            el.hidden ||
                            el.hasAttribute('hidden')) {{
                            el.setAttribute('data-hidden-element', 'true');
                        }}
                    }});
                }});
                
                html_content = await page.content();
                title = await page.title();
                url = page.url();
                log.info("Successfully extracted HTML content (" + html_content.length + " chars)");
            }} catch (extractErr) {{
                log.error("Error extracting content: " + extractErr.message);
                return {{ status: 'error', uen, error: 'Failed to extract content: ' + extractErr.message }};
            }}

            return {{ status: 'success', uen, url, title, html_content }};
            
        }} catch (err) {{
            log.error("Unexpected error in pageFunction: " + err.message);
            log.error("Stack: " + err.stack);
            return {{ status: 'error', uen, error: err.message }};
        }}
    }}
    """

    # Actor input for one sequential RecordOwl lookup driven by page_function.
    run_input = dict(
        startUrls=[dict(url="https://recordowl.com/")],
        useChrome=True,
        headless=True,
        stealth=True,
        pageFunction=page_function,
        ignoreSslErrors=False,
        ignoreCorsAndCsp=False,
        maxRequestRetries=3,  # retry budget handed to the actor
        maxRequestsPerCrawl=1,  # a single page per run
        maxConcurrency=1,  # strictly sequential, no parallel requests
        pageLoadTimeoutSecs=90,
        pageFunctionTimeoutSecs=180,  # three minutes for the page function
        waitUntil=["networkidle2"],  # wait for the network to go idle
        # Route traffic through the Apify proxy, residential group.
        proxyConfiguration=dict(
            useApifyProxy=True,
            apifyProxyGroups=["RESIDENTIAL"],
        ),
        proxyRotation="RECOMMENDED",
    )

    # Use retry logic for 403 errors (5 attempts = more chances to recover)
    run, error = run_apify_with_retry(client, run_input, uen, max_retries=5)

    if error or not run:
        print(f"  ‚ùå Apify call failed for {uen}: {error}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": error or "No run returned"
        })
        time.sleep(10)  # Longer sleep after failure
        continue

    if not run or "defaultDatasetId" not in run:
        print(f"  ‚ö†Ô∏è No valid dataset returned for {uen}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": "No dataset returned"
        })
        continue

    # Give the run's dataset a moment to become readable before probing it.
    print(f"  ‚è≥ Waiting for dataset to be ready...")
    time.sleep(5)  # initial wait

    # Probe up to three times for a first item, pausing 3s and then 6s between
    # probes; the loop stops early as soon as one item is visible.
    dataset_client = client.dataset(run["defaultDatasetId"])
    for check_attempt in range(3):
        try:
            test_fetch = dataset_client.list_items(limit=1, clean=True)
            if test_fetch.items:
                break
        except Exception as poll_err:
            # Best-effort probe: a failure here must not abort the UEN, but it is
            # reported instead of silently dropped. Catching Exception (not a bare
            # except) keeps Ctrl+C / SystemExit able to stop the long-running loop.
            print(f"  [warn] Dataset readiness check failed (attempt {check_attempt + 1}/3): {poll_err}")

        if check_attempt < 2:
            additional_wait = 3 * (check_attempt + 1)
            print(f"  ‚è≥ Dataset not ready, waiting {additional_wait}s more...")
            time.sleep(additional_wait)
    
    record_owl_url = None
    scraped_html = None

    # Read the run's items through the safe fetch helper
    # (called with max_retries=5 and initial_wait=5).
    dataset_items = fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=5)

    if dataset_items:
        print(f"  üìä Dataset has {len(dataset_items)} item(s)")
    else:
        print(f"  ‚ö†Ô∏è Dataset is empty - no items returned!")

    # Report every item's status; a 'success' item supplies the HTML and URL.
    for item in dataset_items:
        item_status = item.get("status")
        if item_status == "success":
            scraped_html = item.get("html_content", "")
            record_owl_url = item.get("url")
            if not scraped_html:
                print(f"  ‚ö†Ô∏è Status is 'success' but html_content is empty for {uen}")
            else:
                print(f"  ‚úÖ Successfully scraped {uen} ({len(scraped_html)} chars of HTML)")
        elif item_status == "not_found":
            print(f"  ‚ö†Ô∏è Company not found for UEN {uen}")
        elif item_status == "error":
            print(f"  ‚ùå Error for {uen}: {item.get('error')}")
        else:
            print(f"  ‚ö†Ô∏è Unknown item status for {uen}: {item_status}")
            print(f"  üìã Item keys: {list(item.keys())}")

    if not scraped_html:
        # Determine the specific reason for failure
        if not dataset_items:
            error_reason = "Dataset empty (likely 403 block at Apify level)"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "not_found" for item in dataset_items):
            error_reason = "Company not found on RecordOwl"
            print(f"  ‚ùå {error_reason}")
        elif any(item.get("status") == "error" for item in dataset_items):
            error_details = [item.get("error", "Unknown") for item in dataset_items if item.get("status") == "error"]
            error_reason = f"Scraping error: {error_details[0] if error_details else 'Unknown'}"
            print(f"  ‚ùå {error_reason}")
        else:
            error_reason = "No HTML content retrieved (unknown reason)"
            print(f"  ‚ö†Ô∏è {error_reason}")
            # Debug: show what's in dataset items
            if dataset_items:
                print(f"  üîç DEBUG - First item: {dataset_items[0]}")
        
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": error_reason
        })
        time.sleep(5)
        continue

    # Parse HTML
    try:
        soup = BeautifulSoup(scraped_html, "html.parser")

        # Drop every node carrying the data-hidden-element="true" marker.
        hidden_nodes = soup.find_all(attrs={"data-hidden-element": "true"})
        for hidden_node in hidden_nodes:
            hidden_node.decompose()
        if hidden_nodes:
            print(f"  üóëÔ∏è Removed {len(hidden_nodes)} hidden elements from HTML")

        # Prefer the overview panel; the selectors are tried in order and the
        # first one that matches wins.
        overview_tab = None
        for overview_selector in ("#overview", "[aria-labelledby*='overview']", "div[role='tabpanel']"):
            overview_tab = soup.select_one(overview_selector)
            if overview_tab:
                break

        if not overview_tab:
            # Fallback: take the main container and strip the sections whose
            # id/class names refer to officers, shareholders or appointments.
            parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")
            if parent:
                personal_sections = parent.select(
                    "#officers, #shareholders, #appointments, "
                    "[id*='officer'], [id*='shareholder'], [id*='appointment'], "
                    ".officer-section, .shareholder-section"
                )
                for unwanted_section in personal_sections:
                    unwanted_section.decompose()
                print(f"  üßπ Removed officer/shareholder/appointment sections from page")
        else:
            parent = overview_tab
            print(f"  ‚úÖ Targeting Overview tab only (excluding officer/director data)")

        # Strip scripts, styles and inline display:none nodes from the chosen container.
        if parent:
            non_visible_selector = "script, style, noscript, [style*='display:none'], [style*='display: none']"
            for unwanted in parent.select(non_visible_selector):
                unwanted.decompose()

        # Fresh accumulators for this UEN.
        emails = []
        phones = []
        website = None
        facebook_links = []
        linkedin_links = []
        instagram_links = []
        tiktok_links = []
        
        # Helper function to check if element is visible
        def is_element_visible(element):
            """Return True when a BeautifulSoup element shows no sign of being hidden.

            An element is treated as hidden (False is returned) when it is None,
            carries the ``data-hidden-element`` marker attribute, has an inline
            style containing ``display:none`` or ``visibility:hidden`` (any case,
            any whitespace), has the ``hidden`` attribute, or has
            ``aria-hidden="true"``.
            """
            if element is None:
                return False
            # Marker attribute: its mere presence means "hidden".
            if element.has_attr('data-hidden-element'):
                return False
            # Remove all whitespace so "display : none" and "display:none" compare equal.
            inline_style = "".join(str(element.get('style') or '').lower().split())
            if 'display:none' in inline_style or 'visibility:hidden' in inline_style:
                return False
            # `hidden` is a boolean attribute: a bare <div hidden> is parsed with an
            # empty-string value, so test for presence rather than truthiness.
            if element.has_attr('hidden') or element.get('aria-hidden') == 'true':
                return False
            return True

        if parent:
            # Extract emails from mailto: links. A mailto URI may carry a
            # "?subject=..." style query and several comma-separated recipients
            # (RFC 6068); only the bare addresses are stored.
            for a in parent.select("a[href^=mailto]"):
                recipients = a.get("href", "").replace("mailto:", "").split("?", 1)[0]
                for email in (part.strip() for part in recipients.split(",")):
                    if email and email not in emails and "@" in email:
                        emails.append(email)

            # ========== COMPREHENSIVE PHONE EXTRACTION ==========
            # Singapore numbers are accepted with any spacing/formatting, e.g.
            # "65 63 19 2960", "6563192960", "+65-6319-2960", "(65) 6319 2960".
            # Each candidate is reduced to its digits and validated by one shared
            # helper, so the three extraction methods below apply identical rules.

            def _normalize_sg_digits(digits, allow_tail_fallback=False):
                """Map a digit string to (formatted_number, label), or (None, None).

                Accepted shapes: exactly 10 digits "65" + [689]xxxxxxx, exactly
                8 digits starting with 6/8/9, or, for longer strings, the first
                embedded "65[689]xxxxxxx" run. With allow_tail_fallback, a longer
                string with no such run falls back to its last 8 digits when they
                start with 6/8/9.
                """
                if len(digits) == 10 and digits.startswith("65") and digits[2] in "689":
                    return "+" + digits, "10 digits"
                if len(digits) == 8 and digits[0] in "689":
                    return "+65" + digits, "8 digits"
                if len(digits) > 10:
                    for start in range(len(digits) - 9):
                        if digits[start:start + 2] == "65" and digits[start + 2] in "689":
                            return "+" + digits[start:start + 10], "extracted"
                    if allow_tail_fallback and digits[-8] in "689":
                        return "+65" + digits[-8:], "last 8 digits"
                return None, None

            def _record_phone(found_phones, digits, source, allow_tail_fallback=False):
                """Append the normalized form of digits to found_phones unless invalid or already present."""
                formatted, label = _normalize_sg_digits(digits, allow_tail_fallback)
                if formatted and formatted not in found_phones:
                    found_phones.append(formatted)
                    print(f"  ‚úÖ Added from {source} ({label}): {formatted}")

            print(f"  üîç Searching for phone numbers...")

            # Method 1: tel: links, visible ones only.
            tel_links = parent.select("a[href^='tel:'], a[href^='tel']")
            visible_tel_links = [link for link in tel_links if is_element_visible(link)]
            if visible_tel_links:
                print(f"  üì± Found {len(visible_tel_links)} visible tel: links (filtered from {len(tel_links)} total)")

            for a in visible_tel_links:
                tel_href = a.get("href", "").replace("tel:", "").strip()
                tel_text = a.get_text(strip=True)
                print(f"  üìû Tel link - href: '{tel_href}', text: '{tel_text}'")

                digits_only = re.sub(r"\D", "", tel_href)
                print(f"  üî¢ Tel digits: {digits_only}")

                if len(digits_only) > 10:
                    print(f"  üîç Searching within {len(digits_only)} digits for valid pattern...")
                # Only tel links use the "last 8 digits" fallback.
                _record_phone(phones, digits_only, "tel link", allow_tail_fallback=True)

            # Method 2: dt/dd pairs whose label looks like a company-level contact.
            # The keyword lists are loop-invariant, so they are built once here.
            company_contact_keywords = [
                "company contact", "business contact", "office phone",
                "main phone", "business phone", "company phone"
            ]
            # Accepted only when none of the exclusion keywords is present.
            general_contact_keywords = ["contact number", "phone", "tel", "mobile", "call", "contact no"]
            # Labels containing these point at a person rather than the company.
            exclude_keywords = [
                "officer", "charge", "employee", "shareholder", "director",
                "registration", "person", "individual", "member", "partner",
                "manager", "owner", "proprietor", "authorized", "representative",
                "appointment", "designation", "name of", "appointed"
            ]

            dt_tags = parent.select("dt")
            visible_dt_tags = [dt for dt in dt_tags if is_element_visible(dt)]
            if visible_dt_tags:
                print(f"  üìã Found {len(visible_dt_tags)} visible dt tags (filtered from {len(dt_tags)} total)")

            for dt in visible_dt_tags:
                dt_text = dt.get_text(strip=True).lower()

                is_company_contact = any(kw in dt_text for kw in company_contact_keywords)
                is_general_contact = any(kw in dt_text for kw in general_contact_keywords)
                is_excluded = any(excl in dt_text for excl in exclude_keywords)

                # Keep explicit company contacts, or general contacts without exclusions.
                if not (is_company_contact or (is_general_contact and not is_excluded)):
                    continue

                dd = dt.find_next_sibling("dd")
                # The value cell must exist and be visible as well.
                if not (dd and is_element_visible(dd)):
                    continue

                number_text = dd.get_text(" ", strip=True)
                contact_type = "COMPANY" if is_company_contact else "GENERAL"
                print(f"  üìù [{contact_type}] Field '{dt_text}': {number_text}")

                all_digits = re.sub(r"\D", "", number_text)
                print(f"  üî¢ Extracted digits: {all_digits}")

                _record_phone(phones, all_digits, "dt/dd")

            # Method 3: last resort, regex scan over the container's text when the
            # first two methods found nothing.
            if not phones:
                print(f"  üîé No phones found yet, searching entire visible content...")
                full_text = parent.get_text()

                # Each pattern tolerates arbitrary spaces/dashes between digits.
                patterns = [
                    # Pattern 1: "+65" prefix
                    r"\+[\s\-]*65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 2: "(65)" prefix
                    r"\([\s\-]*65[\s\-]*\)[\s\-]*[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 3: bare "65" followed by a space/dash
                    r"(?<!\d)65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                    # Pattern 4: 8 digits starting with 6/8/9
                    r"(?<!\d)[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                ]

                for pattern_idx, pattern in enumerate(patterns, 1):
                    matches = re.findall(pattern, full_text)
                    if matches:
                        print(f"  üîç Pattern {pattern_idx} found {len(matches)} potential matches")

                    for match in matches:
                        digits = re.sub(r"\D", "", match)
                        print(f"  üî¢ Pattern {pattern_idx} match: '{match.strip()}' ‚Üí digits: '{digits}'")
                        _record_phone(phones, digits, f"pattern {pattern_idx}")

            if phones:
                print(f"  ‚úÖ Total phones found: {phones}")
            else:
                print(f"  ‚ö†Ô∏è WARNING: No phone numbers found for {uen}")
                print(f"  üìÑ Showing first 500 chars of parent HTML for debugging:")
                print(parent.prettify()[:500] + "...")
            # ========== END PHONE EXTRACTION ==========

            # Extract website: the first http(s) link that is not a social network,
            # not RecordOwl/Apify itself, and contains a recognised TLD fragment.
            website_tlds = (".com", ".sg", ".net", ".org", ".co")
            skipped_hosts = ("recordowl", "apify.com")
            valid_websites = []
            for a in parent.select("a[href^=http]"):
                href = a.get("href", "").strip()
                href_lower = href.lower()
                if any(domain in href_lower for domain in SOCIAL_MEDIA_DOMAINS):
                    continue
                if any(skip in href_lower for skip in skipped_hosts):
                    continue
                # Match TLDs against the lower-cased URL, like the filters above,
                # so an upper-case host such as "http://EXAMPLE.COM" is not rejected.
                if any(tld in href_lower for tld in website_tlds):
                    valid_websites.append(href)
            website = valid_websites[0] if valid_websites else None

        # Collect social profile links from the whole page (not just `parent`).
        social_buckets = (
            ("facebook.com", facebook_links),
            ("linkedin.com", linkedin_links),
            ("instagram.com", instagram_links),
            ("tiktok.com", tiktok_links),
        )
        for a in soup.find_all("a", href=True):
            href = a["href"].strip().lower()
            for social_domain, bucket in social_buckets:
                # A link goes into the first bucket whose domain it contains and
                # that does not hold it yet; a link already stored in one bucket
                # is still tested against the remaining domains.
                if social_domain in href and href not in bucket:
                    bucket.append(href)
                    break

        # Success record. The link lists are de-duplicated in first-seen order:
        # going through set() made the stored order vary between runs.
        # "Error" is set explicitly so success rows carry the same keys as the
        # failure rows.
        all_results.append({
            "UEN": uen,
            "Emails": emails or None,
            "Phones": phones or None,
            "Website": website,
            "Facebook": list(dict.fromkeys(facebook_links)) or None,
            "LinkedIn": list(dict.fromkeys(linkedin_links)) or None,
            "Instagram": list(dict.fromkeys(instagram_links)) or None,
            "TikTok": list(dict.fromkeys(tiktok_links)) or None,
            "RecordOwl_Link": record_owl_url,
            "Error": None,
        })
        print(f"  ‚úÖ Processed {uen}: {len(emails)} emails, {len(phones)} phones")
        
    except Exception as e:
        print(f"  ‚ùå Error parsing HTML for {uen}: {e}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": f"HTML parsing error: {str(e)}"
        })

    # Throttle between lookups to reduce rate limiting and 403 blocks.
    # The pause is 20s plus an offset of (idx % 10) + 5, i.e. 25-34s in total.
    # NOTE: the offset cycles with idx; it is deterministic, not randomly drawn.
    sleep_time = 20 + ((idx % 10) + 5)

    print(f"  üí§ Sleeping for {sleep_time}s before next request...")
    time.sleep(sleep_time)

    # Whenever idx is a multiple of 5, pause for an additional 30s.
    if not idx % 5:
        extra_wait = 30
        print(f"  üõë Checkpoint pause: waiting extra {extra_wait}s...")
        time.sleep(extra_wait)

New_Fresh_Leads = pd.DataFrame(all_results)
print("\n‚úÖ Scraping complete!")
print(f"\nüìä Results summary:")
# Summary counts for the scraped leads.
# Each column lookup is guarded: when no result row was collected the frame
# has no columns at all, and indexing it directly would raise KeyError
# instead of reporting zero.
print(f"   Total processed: {len(New_Fresh_Leads)}")
for summary_label, summary_column in (
    ("emails", "Emails"),
    ("phones", "Phones"),
    ("websites", "Website"),
):
    if summary_column in New_Fresh_Leads.columns:
        # Missing values are stored as None, so notna() counts the rows
        # where something was actually found.
        found_count = New_Fresh_Leads[summary_column].notna().sum()
    else:
        found_count = 0
    print(f"   With {summary_label}: {found_count}")

# Last expression of the cell: rich preview of the first rows.
New_Fresh_Leads.head(10)


üîé Processing 53480073D (1/50)
  üì° Starting Apify run for 53480073D (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> 2025-11-06T09:29:58.041Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> 2025-11-06T09:29:58.043Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> 2025-11-06T09:29:58.573Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> 2025-11-06T09:29:59.006Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:8fRbCIPD4GW9aeWcO][0m -> 2025-11-06T09:30:00.006Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:8fR

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 53480073D (133005 chars of HTML)
  üóëÔ∏è Removed 73 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 10 visible dt tags (filtered from 10 total)
  üîé No phones found yet, searching entire visible content...
  üìÑ Showing first 500 chars of parent HTML for debugging:
<div aria-labelledby="overview-tab" class="block" id="overview" role="tabpanel" style="height: auto !important;">
 <div class="bg-white shadow overflow-hidden sm:rounded-lg mb-6">
  <div class="px-4 py-5 sm:px-6 flex justify-between items-center">
   <div>
    <h2 class="text-lg leading-6 font-medium text-gray-900">
     General Information
    </h2>
    <p class="mt-1 max-w-2xl text-sm text-gray-500">
     Official company information and location
    </p>
   </d

[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> 2025-11-06T09:31:49.887Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> 2025-11-06T09:31:49.889Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> 2025-11-06T09:31:49.934Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> 2025-11-06T09:31:50.133Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:40ZhvxRbw1Tsb0crf][0m -> 2025-11-06T09:31:51.600Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:40Z

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 202303828W (1146793 chars of HTML)
  üóëÔ∏è Removed 72 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 11 visible dt tags (filtered from 11 total)
  üìù [GENERAL] Field 'contact number': +65 88162548
  üî¢ Extracted digits: 6588162548
  ‚úÖ Added from dt/dd (10 digits): +6588162548
  ‚úÖ Total phones found: ['+6588162548']
  ‚úÖ Processed 202303828W: 1 emails, 1 phones
  üí§ Sleeping for 27s before next request...

üîé Processing 202542730M (3/50)
  üì° Starting Apify run for 202542730M (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> 2025-11-06T09:33:32.204Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> 2025-11-06T09:33:32.206Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> 2025-11-06T09:33:32.275Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> 2025-11-06T09:33:32.615Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:SiEJvBKax92bURmtb][0m -> 2025-11-06T09:33:33.338Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:SiE

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚ö†Ô∏è Company not found for UEN 202542730M
  ‚ùå Company not found on RecordOwl

üîé Processing 201828332D (4/50)
  üì° Starting Apify run for 201828332D (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> 2025-11-06T09:34:29.572Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> 2025-11-06T09:34:29.574Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> 2025-11-06T09:34:29.639Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> 2025-11-06T09:34:29.847Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:WDQ5kmraEFcpgT8V0][0m -> 2025-11-06T09:34:30.669Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:WDQ

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 201828332D (1488274 chars of HTML)
  üóëÔ∏è Removed 73 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 13 visible dt tags (filtered from 13 total)
  üìù [GENERAL] Field 'contact number': 6980 7200
  üî¢ Extracted digits: 69807200
  ‚úÖ Added from dt/dd (8 digits): +6569807200
  ‚úÖ Total phones found: ['+6569807200']
  ‚úÖ Processed 201828332D: 1 emails, 1 phones
  üí§ Sleeping for 29s before next request...

üîé Processing 201813214E (5/50)
  üì° Starting Apify run for 201813214E (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> 2025-11-06T09:36:40.616Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> 2025-11-06T09:36:40.618Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> 2025-11-06T09:36:40.671Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> 2025-11-06T09:36:40.929Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:iRhsZ8hdalK7LaTBL][0m -> 2025-11-06T09:36:41.989Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:iRh

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 201813214E (467882 chars of HTML)
  üóëÔ∏è Removed 71 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 10 visible dt tags (filtered from 10 total)
  üîé No phones found yet, searching entire visible content...
  üìÑ Showing first 500 chars of parent HTML for debugging:
<div aria-labelledby="overview-tab" class="block" id="overview" role="tabpanel" style="height: auto !important;">
 <div class="bg-white shadow overflow-hidden sm:rounded-lg mb-6">
  <div class="px-4 py-5 sm:px-6 flex justify-between items-center">
   <div>
    <h2 class="text-lg leading-6 font-medium text-gray-900">
     General Information
    </h2>
    <p class="mt-1 max-w-2xl text-sm text-gray-500">
     Official company information and location
    </p>
   </

[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> 2025-11-06T09:38:42.442Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> 2025-11-06T09:38:42.449Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> 2025-11-06T09:38:42.500Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> 2025-11-06T09:38:42.722Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:3Ec1KuFlV2WoPd2yH][0m -> 2025-11-06T09:38:43.419Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:3Ec

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 199400661M (815312 chars of HTML)
  üóëÔ∏è Removed 71 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 12 visible dt tags (filtered from 12 total)
  üìù [GENERAL] Field 'contact number': (65) 6542 1160
  üî¢ Extracted digits: 6565421160
  ‚úÖ Added from dt/dd (10 digits): +6565421160
  ‚úÖ Total phones found: ['+6565421160']
  ‚úÖ Processed 199400661M: 0 emails, 1 phones
  üí§ Sleeping for 31s before next request...

üîé Processing 201838013D (7/50)
  üì° Starting Apify run for 201838013D (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> 2025-11-06T09:40:19.987Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> 2025-11-06T09:40:19.989Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> 2025-11-06T09:40:20.036Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> 2025-11-06T09:40:20.207Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:uvXGUoIgeYLjeaqpz][0m -> 2025-11-06T09:40:20.880Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:uvX

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 201838013D (490435 chars of HTML)
  üóëÔ∏è Removed 72 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 11 visible dt tags (filtered from 11 total)
  üîé No phones found yet, searching entire visible content...
  üìÑ Showing first 500 chars of parent HTML for debugging:
<div aria-labelledby="overview-tab" class="block" id="overview" role="tabpanel" style="height: auto !important;">
 <div class="bg-white shadow overflow-hidden sm:rounded-lg mb-6">
  <div class="px-4 py-5 sm:px-6 flex justify-between items-center">
   <div>
    <h2 class="text-lg leading-6 font-medium text-gray-900">
     General Information
    </h2>
    <p class="mt-1 max-w-2xl text-sm text-gray-500">
     Official company information and location
    </p>
   </

[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> 2025-11-06T09:42:03.081Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> 2025-11-06T09:42:03.082Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> 2025-11-06T09:42:03.125Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> 2025-11-06T09:42:03.290Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:8AhG5LxIM9GgpmwgP][0m -> 2025-11-06T09:42:03.904Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:8Ah

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚úÖ Successfully scraped 53162832M (128058 chars of HTML)
  üóëÔ∏è Removed 73 hidden elements from HTML
  ‚úÖ Targeting Overview tab only (excluding officer/director data)
  üîç Searching for phone numbers...
  üìã Found 8 visible dt tags (filtered from 8 total)
  üîé No phones found yet, searching entire visible content...
  üìÑ Showing first 500 chars of parent HTML for debugging:
<div aria-labelledby="overview-tab" class="block" id="overview" role="tabpanel" style="height: auto !important;">
 <div class="bg-white shadow overflow-hidden sm:rounded-lg mb-6">
  <div class="px-4 py-5 sm:px-6 flex justify-between items-center">
   <div>
    <h2 class="text-lg leading-6 font-medium text-gray-900">
     General Information
    </h2>
    <p class="mt-1 max-w-2xl text-sm text-gray-500">
     Official company information and location
    </p>
   </div

[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> 2025-11-06T09:44:38.500Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> 2025-11-06T09:44:38.507Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> 2025-11-06T09:44:38.569Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> 2025-11-06T09:44:38.764Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:ZKrjwPXyEVCcGIzLJ][0m -> 2025-11-06T09:44:39.705Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:ZKr

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚ö†Ô∏è Company not found for UEN 202540737R
  ‚ùå Company not found on RecordOwl

üîé Processing 200301636R (10/50)
  üì° Starting Apify run for 200301636R (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> 2025-11-06T09:45:44.123Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> 2025-11-06T09:45:44.125Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> 2025-11-06T09:45:44.204Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> 2025-11-06T09:45:44.378Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:MdfMoWPxpga5p0bzt][0m -> 2025-11-06T09:45:45.010Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:Mdf

  ‚è≥ Waiting for run to complete...
  ‚úÖ Run succeeded with data
  ‚è≥ Waiting for dataset to be ready...
  üìä Dataset has 1 item(s)
  ‚ùå Error for 200301636R: Failed to load company page: Navigation timeout of 60000 ms exceeded
  ‚ùå Scraping error: Failed to load company page: Navigation timeout of 60000 ms exceeded

üîé Processing 201700187C (11/50)
  üì° Starting Apify run for 201700187C (attempt 1/5)...


[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> Status: RUNNING, Message: 
[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> 2025-11-06T09:49:25.917Z ACTOR: Pulling container image of build g6G5r98rF5fM6ecm3 from registry.
[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> 2025-11-06T09:49:25.918Z ACTOR: Creating container.
[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> 2025-11-06T09:49:25.989Z ACTOR: Starting container.
[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> 2025-11-06T09:49:26.146Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.puppeteer-scraper runId:QfYOrqWcCN3G9bjPF][0m -> 2025-11-06T09:49:26.762Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.4","apifyClientVersion":"2.16.0","crawleeVersion":"3.14.1","osType":"Linux","nodeVersion":"v22.19.0"}[39m
[36m[apify.puppeteer-scraper runId:QfY

In [None]:
New_Fresh_Leads

### Append and save into Excel sheet

In [None]:

# # Load both Excel files
# file_path_1 = "Fresh_Leads.xlsx"
# Fresh_Leads = pd.read_excel(file_path_1)

# # file_path_2 = "recordowl_results_4.xlsx"
# # recordowl_results_4 = pd.read_excel(file_path_2)

# # Append (combine) them
# combined_df = pd.concat([Fresh_Leads, Fresh_Leads_with_phones], ignore_index=True)

# # Optional: Save to a new Excel file
# combined_df.to_excel("Fresh_Leads_New.xlsx", index=False)

# # Preview
# combined_df


In [None]:
# count_non_nan = combined_df['Phones'].notna().sum()
# print(count_non_nan)


### Website Scraping

In [None]:
# import httpx
# import asyncio

# # =====================================================
# # Validate Website (only if no phone number)
# # =====================================================
# async def check_url(url: str) -> bool:
#     """Return True if the URL is reachable (status < 400)."""
#     if not url:
#         return False
#     try:
#         async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
#             response = await client.head(url)
#             return response.status_code < 400
#     except Exception:
#         return False


# async def validate_if_needed(df):
#     """Validate websites only if phone number is missing."""
#     for i, row in df.iterrows():
#         url = row.get("Website")
#         phone = row.get("Phones")

#         # Skip validation if phone exists
#         if phone:
#             df.at[i, "Website_Valid"] = None
#             continue

#         # Validate website if no phone
#         if url:
#             is_valid = await check_url(url)
#             df.at[i, "Website_Valid"] = "valid" if is_valid else "invalid"
#         else:
#             df.at[i, "Website_Valid"] = "invalid"

#     return df


# # =====================================================
# # Run async validation safely inside Jupyter
# # =====================================================
# result_df = await validate_if_needed(result_df)

# # =====================================================
# # Final output
# # =====================================================
# display(result_df)


### If the contact number is invalid, scrape the company website to get a contact number

In [None]:
# import asyncio
# import os
# import time
# from apify_client import ApifyClient

# # --- Initialize Apify client ---
# APIFY_TOKEN = os.environ["APIFY_TOKEN"]  # secret redacted: supply the token via the environment, never hard-code it (revoke any previously committed token)
# client = ApifyClient(APIFY_TOKEN)

# # --- Async wrapper so you can run in Jupyter ---
# async def enrich_with_contact_info(df):
#     """Scrape contact info for rows where Website_Valid == 'valid' and Phones is empty."""
#     updated_df = df.copy()

#     for i, row in df.iterrows():
#         website = row.get("Website")
#         status = row.get("Website_Valid")
#         phone = row.get("Phones")

#         if not website or status != "valid" or phone:
#             continue  # Skip invalid or already complete rows

#         print(f"üîç Scraping contact page for: {website}")

#         # --- CONVERTED TO PUPPETEER-SCRAPER (same as Cell 20) ---
#         # Now using native Puppeteer syntax instead of jQuery
#         run_input = {
#             "startUrls": [{"url": website}],
#             "pageFunction": r"""
#                 async function pageFunction(context) {
#                     const { page, log, request } = context;
#                     const isContact = request.userData?.isContact || false;

#                     // If not on contact page yet, try to find and navigate to it
#                     if (!isContact) {
#                         try {
#                             // Wait for page to load
#                             await page.waitForSelector('a', { timeout: 10000 }).catch(() => null);
                            
#                             // Find contact page link using Puppeteer
#                             const contactUrl = await page.evaluate(() => {
#                                 const links = Array.from(document.querySelectorAll('a[href]'));
#                                 for (const link of links) {
#                                     const href = link.getAttribute('href');
#                                     if (href && href.toLowerCase().includes('contact')) {
#                                         return href.startsWith('http') ? href : window.location.origin + href;
#                                     }
#                                 }
#                                 return null;
#                             });

#                             if (contactUrl) {
#                                 await context.enqueueRequest({ 
#                                     url: contactUrl, 
#                                     userData: { isContact: true } 
#                                 });
#                                 log.info(`Enqueued contact page: ${contactUrl}`);
#                             }
#                             return null;
#                         } catch (err) {
#                             log.error(`Error finding contact page: ${err.message}`);
#                             return null;
#                         }
#                     }

#                     // We're on the contact page - extract emails and phones
#                     try {
#                         // Wait for content to load
#                         await new Promise(r => setTimeout(r, 3000));

#                         // Extract emails and phones using Puppeteer
#                         const contactData = await page.evaluate(() => {
#                             // Helper: check if element is visible
#                             function isVisible(el) {
#                                 return el && el.offsetParent !== null;
#                             }

#                             // Extract emails from mailto links
#                             const emailLinks = Array.from(document.querySelectorAll('a[href^="mailto"]'));
#                             const emails = emailLinks
#                                 .filter(el => isVisible(el))
#                                 .map(el => el.getAttribute('href').replace('mailto:', '').trim())
#                                 .filter(email => email.length > 0);

#                             // Extract phones from tel links
#                             const phoneLinks = Array.from(document.querySelectorAll('a[href^="tel"]'));
#                             const phones = phoneLinks
#                                 .filter(el => isVisible(el))
#                                 .map(el => el.getAttribute('href').replace(/[^0-9]/g, ''))
#                                 .filter(phone => phone.length > 0);

#                             return {
#                                 emails: [...new Set(emails)],
#                                 phones: [...new Set(phones)]
#                             };
#                         });

#                         return {
#                             contactUrl: request.url,
#                             emails: contactData.emails.length ? contactData.emails : [],
#                             phones: contactData.phones.length ? contactData.phones : []
#                         };
#                     } catch (err) {
#                         log.error(`Error extracting contact data: ${err.message}`);
#                         return {
#                             contactUrl: request.url,
#                             emails: [],
#                             phones: [],
#                             error: err.message
#                         };
#                     }
#                 }
#             """,
#             "useChrome": True,
#             "headless": True,
#             "stealth": True,
#             "ignoreSslErrors": False,
#             "ignoreCorsAndCsp": False,
#             "maxRequestRetries": 3,  # Increased retry attempts
#             "maxRequestsPerCrawl": 0,  # No limit (will crawl main + contact pages)
#             "maxConcurrency": 1,  # No parallel requests
#             "pageLoadTimeoutSecs": 90,  # Optimized timeout
#             "pageFunctionTimeoutSecs": 180,  # 3 minutes for pageFunction
#             "waitUntil": ["networkidle2"],  # Wait for network to be idle
#             # OPTIMIZED: Residential proxies with recommended rotation
#             "proxyConfiguration": {
#                 "useApifyProxy": True,
#                 "apifyProxyGroups": ["RESIDENTIAL"],  # Residential IPs less likely to be blocked
#             },
#             "proxyRotation": "RECOMMENDED",  # Optimal proxy rotation strategy
#         }

#         # --- Run the Apify scraper (NOW USING PUPPETEER-SCRAPER) ---
#         try:
#             print(f"  üì° Starting Apify puppeteer-scraper...")
#             run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
            
#             # Wait for dataset to be ready
#             time.sleep(3)
            
#             dataset = client.dataset(run["defaultDatasetId"])
#             results = list(dataset.iterate_items())
#             contact_results = [r for r in results if r and (r.get("emails") or r.get("phones"))]

#             if contact_results:
#                 scraped = contact_results[0]
#                 updated_df.at[i, "Emails"] = scraped.get("emails", None)
#                 updated_df.at[i, "Phones"] = scraped.get("phones", None)
#                 updated_df.at[i, "Contact_Page"] = scraped.get("contactUrl", None)
#                 print(f"  ‚úÖ Found: {scraped.get('phones', [])} / {scraped.get('emails', [])}")
#             else:
#                 print("  ‚ö†Ô∏è No contact data found.")

#         except Exception as e:
#             print(f"  ‚ùå Error scraping {website}: {e}")
        
#         # Add delay to avoid rate limiting
#         time.sleep(5)

#     return updated_df


# # --- Run the scraper for valid websites ---
# result_df = await enrich_with_contact_info(result_df)

# # --- Display updated results ---
# display(result_df)


### Facebook Scraping

In [None]:
# # Initialize the ApifyClient with your API token
# client = ApifyClient(os.environ["APIFY_TOKEN"])  # secret redacted: supply the token via the environment, never hard-code it (revoke any previously committed token)

# # Function to validate Singapore phone numbers (MUST have country code)
# def validate_singapore_number(phone):
#     if not phone:
#         return None
    
#     # Remove all spaces, dashes, parentheses
#     cleaned = re.sub(r'[\s\-\(\)]', '', str(phone))
    
#     # MUST have country code: +65XXXXXXXX or 65XXXXXXXX
#     # First digit after country code must be 6, 8, or 9
#     # Total of 8 digits after country code
#     if re.match(r'^\+?65[689]\d{7}$', cleaned):
#         return phone  # Return original format
    
#     # Not a valid Singapore number with country code
#     return None

# # Prepare the Actor input
# run_input = {
#     "pages": [
#         "https://www.facebook.com/KPECTHub/",
#     ],
#     "language": "en-US",
# }

# # Run the Actor and wait for it to finish
# run = client.actor("oJ48ceKNY7ueGPGL0").call(run_input=run_input)

# # Collect results
# results = []
# for item in client.dataset(run["defaultDatasetId"]).iterate_items():
#     # Extract phone from multiple possible fields
#     raw_phone = item.get('phone', None) or item.get('wa_number', None)
    
#     # Validate it's a Singapore number WITH country code
#     phone = validate_singapore_number(raw_phone)
    
#     # Extract email
#     email = item.get('email', None)
    
#     # Extract website from the websites list (take first non-Google Maps link if available)
#     websites = item.get('websites', [])
#     website = None
#     if websites:
#         # Filter out Google Maps links and take the first real website
#         real_websites = [w for w in websites if 'maps.google.com' not in w]
#         website = real_websites[0] if real_websites else websites[0]
    
#     results.append({
#         'facebook_url': item.get('facebookUrl', None),
#         'page_name': item.get('pageName', None),
#         'phone': phone,  # Only Singapore numbers WITH country code or None
#         'email': email,
#         'website': website,
#         'address': item.get('address', None)
#     })

# # Create DataFrame
# df = pd.DataFrame(results)

# df