In [19]:
import requests
import pandas as pd
import numpy as np
import glob
import os
import re
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI


### Getting Master DB Data

In [20]:

# --- CONFIG ---
# Path (relative to the notebook's working directory) of the Master DB Excel
# workbook read by pd.read_excel() further down in this cell.
# NOTE: the SSIC-mapping cell later rebinds this same name to a different file.
file_path = "./Master DB/Master_DB_oct22.xlsx"

# --- HELPER FUNCTIONS ---
def clean_uen(u: str) -> str | None:
    if pd.isna(u):
        return None
    return re.sub(r"[^A-Z0-9]", "", str(u).upper().strip())

def clean_text(text: str) -> str | None:
    if pd.isna(text):
        return None
    text = str(text).strip().upper()
    return None if text == "NAN" else text

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the columns of ``df`` to an UPPER_SNAKE_CASE form.

    Each label is converted to text and upper-cased, every run of characters
    outside A-Z / 0-9 is collapsed into a single underscore, and leading and
    trailing underscores are removed.

    Non-string labels (for example integer headers coming from a spreadsheet)
    are converted with ``str()`` first instead of raising ``AttributeError``.

    Args:
        df: Frame whose column labels should be normalised.

    Returns:
        The same ``df`` object; its ``columns`` are replaced in place.
    """
    df.columns = [
        # "+" collapses consecutive separators (including existing underscores)
        # in one pass, so no second "collapse repeated underscores" step is needed.
        re.sub(r"[^A-Z0-9]+", "_", str(col).upper()).strip("_")
        for col in df.columns
    ]
    return df

# --- LOAD DATA ---
master_db_df = pd.read_excel(file_path)

# --- SELECT RELEVANT COLUMNS ---
columns_to_keep = [
    "Company Registration Number (UEN)",
    "ACRA REGISTERED NAME",
    "Brand/Deal Name/Business Name",
    "Primary SSIC Code",
    "PIC NAME 1 Contact Number",
    "PIC 1 email address",
    "Website URL",
    "Parent Industry Type",
    "Sub Industry"
]
# .copy() so the column edits below act on an independent frame, not a slice.
master_db_df = master_db_df[columns_to_keep].copy()

# --- STANDARDIZE COLUMN NAMES ---
master_db_df = standardize_columns(master_db_df)

# --- CLEANING & RENAME SPECIFIC COLUMNS ---
# Dynamically find the UEN column (first column containing 'UEN')
uen_col = [c for c in master_db_df.columns if "UEN" in c][0]
master_db_df["UEN"] = master_db_df[uen_col].apply(clean_uen)
# Only drop the source column when it is a different column; if it were already
# called "UEN", dropping it would throw away the cleaned values just written.
if uen_col != "UEN":
    master_db_df = master_db_df.drop(columns=[uen_col])

# Rename other columns consistently
rename_map = {
    "BRAND_DEAL_NAME_BUSINESS_NAME": "BRAND_NAME",
    "PRIMARY_SSIC_CODE": "SSIC_CODE",
    "ACRA_REGISTERED_NAME": "ACRA_REGISTERED_NAME"  # identity entry: name is already in its final form
}
master_db_df = master_db_df.rename(columns={k: v for k, v in rename_map.items() if k in master_db_df.columns})

# Clean text columns
for col in ["ACRA_REGISTERED_NAME", "BRAND_NAME"]:
    if col in master_db_df.columns:
        master_db_df[col] = master_db_df[col].apply(clean_text)

# Convert SSIC_CODE to a nullable integer if it exists.
# pd.to_numeric(errors="coerce") first, matching how the other cells of this
# notebook parse SSIC codes: non-numeric entries become <NA> instead of
# aborting the whole cell.
if "SSIC_CODE" in master_db_df.columns:
    master_db_df["SSIC_CODE"] = pd.to_numeric(
        master_db_df["SSIC_CODE"], errors="coerce"
    ).astype("Int64")

# Keep only required columns if they exist
required_cols = ["UEN", "ACRA_REGISTERED_NAME", "BRAND_NAME", "SSIC_CODE"]
master_db_df = master_db_df[[c for c in required_cols if c in master_db_df.columns]]

# Filter out rows with missing or empty UEN (clean_uen returns "" when a value
# contained no letters or digits). The trailing .copy() makes the result an
# independent frame so later cells can assign to its columns without
# SettingWithCopyWarning.
has_uen = master_db_df["UEN"].notna() & (master_db_df["UEN"].str.strip() != "")
master_db_df = master_db_df[has_uen].copy()

master_db_df


Unnamed: 0,UEN,ACRA_REGISTERED_NAME,BRAND_NAME,SSIC_CODE
0,04799400B,AIK BEE TEXTILE CO,AIK BEE TEXTILE CO,46411
1,03376200K,SERANGOON GARDEN CLINIC AND DISPENSARY,GARDEN CLINIC,550263
2,06239600E,SALON DE BENZIMEN,SALON DE BENZIMEN,96021
3,06952000C,SU LAN LADIES FASHION,SU LAN LADIES FASHION,14103
4,10381600C,SIN HAI PRINTING SERVICE,SIN HAI PRINTING SERVICE,18113
...,...,...,...,...
7444,201734006N,MISTER MOBILE HOUGANG PTE. LTD.,MISTER MOBILE (HOUGANG),95120
7445,202210879W,MISTER MOBILE CHINATOWN PTE. LTD.,MISTER MOBILE (CHINATOWN),47411
7446,202205507G,MISTER MOBILE PTE. LTD.,MISTER MOBILE HQ,64202
7454,53473046M,BLOONIES,BLOONIES,47742


### Getting ACRA Data (keep only Live / Live Company entities and exclude non-relevant SSIC codes)

In [None]:
# -------------------------------------------------------------
# Folder containing your CSVs
# -------------------------------------------------------------
folder_path = "Acra_Data"

# Get all CSV file paths inside the folder.
# Sorted because glob returns paths in arbitrary, filesystem-dependent order;
# the row order of the combined frame determines which rows the seeded
# .sample(random_state=42) picks later in the notebook.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not csv_files:
    # Without this check pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in folder '{folder_path}'")

# Read and combine all CSVs
# Using low_memory=False to avoid DtypeWarning for mixed types
df = pd.concat((pd.read_csv(f, low_memory=False) for f in csv_files), ignore_index=True)

# -------------------------------------------------------------
# Convert all column names to uppercase
# -------------------------------------------------------------
df.columns = df.columns.str.upper()


# -------------------------------------------------------------
# Select relevant columns (now in uppercase)
# -------------------------------------------------------------
acra_data = df[[
    "UEN",
    "ENTITY_NAME",
    "BUSINESS_CONSTITUTION_DESCRIPTION",
    "ENTITY_TYPE_DESCRIPTION",
    "ENTITY_STATUS_DESCRIPTION",
    "REGISTRATION_INCORPORATION_DATE",
    "PRIMARY_SSIC_CODE",
    "STREET_NAME",
    "POSTAL_CODE"
]].copy()

# -------------------------------------------------------------
# Convert to proper data types and clean string columns:
# cast to string, trim, collapse repeated whitespace, uppercase.
# STREET_NAME and POSTAL_CODE are cast as well: the .str accessor below is only
# valid on string data, and read_csv may have inferred a numeric dtype for them.
# -------------------------------------------------------------
string_cols = [
    'UEN',
    'ENTITY_NAME',
    'BUSINESS_CONSTITUTION_DESCRIPTION',
    'ENTITY_TYPE_DESCRIPTION',
    'ENTITY_STATUS_DESCRIPTION',
    'STREET_NAME',
    'POSTAL_CODE'
]
for col in string_cols:
    acra_data[col] = (
        acra_data[col]
        .astype('string')
        .fillna('')
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .str.upper()
    )

acra_data['REGISTRATION_INCORPORATION_DATE'] = pd.to_datetime(
    acra_data['REGISTRATION_INCORPORATION_DATE'], errors='coerce'
)

# -------------------------------------------------------------
# Replace placeholders with NaN for standardization
# -------------------------------------------------------------
acra_data = acra_data.replace(['NA', 'N/A', '-', ''], np.nan)

# -------------------------------------------------------------
# Convert registration date to dd-mm-yyyy string (optional)
# NOTE: any later re-parse of this column must pass format='%d-%m-%Y'.
# -------------------------------------------------------------
acra_data['REGISTRATION_INCORPORATION_DATE'] = acra_data['REGISTRATION_INCORPORATION_DATE'].dt.strftime('%d-%m-%Y')

# -------------------------------------------------------------
# Filter only live entities (LIVE COMPANY or LIVE)
# -------------------------------------------------------------
acra_data = acra_data[
    acra_data['ENTITY_STATUS_DESCRIPTION'].isin(['LIVE COMPANY', 'LIVE'])
].reset_index(drop=True)

# -------------------------------------------------------------
# Exclude specific PRIMARY_SSIC_CODE values (supposedly the data would be 600k plus but when we exclude this would lessen)
# -------------------------------------------------------------
exclude_codes = [
    46900, 47719, 47749, 47539, 47536, 56123,
    10711, 10712, 10719, 10732, 10733, 93209
]

# Compare numerically: PRIMARY_SSIC_CODE has not been converted to a number at
# this point, and if read_csv loaded it as text, isin() against the integer
# codes above would match nothing and silently exclude no rows.
ssic_numeric = pd.to_numeric(acra_data['PRIMARY_SSIC_CODE'], errors='coerce')
acra_data = acra_data[~ssic_numeric.isin(exclude_codes)].reset_index(drop=True)

In [22]:
# Display the cleaned ACRA frame (live entities only, excluded SSIC codes removed).
acra_data

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623
...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034


### Getting SSIC Industry code

In [23]:
# --- CONFIG ---
file_path = "./SSIC_Code/mapped_ssic_code.xlsx"

# --- LOAD DATA ---
mapped_ssic_code = pd.read_excel(file_path)

# --- STANDARDIZE COLUMN NAMES ---
# Uppercase, strip spaces, replace spaces with underscores
mapped_ssic_code.columns = (
    mapped_ssic_code.columns
    .str.strip()
    .str.upper()
    .str.replace(" ", "_")
)

# --- KEEP ONLY DESIRED COLUMNS ---
columns_to_keep = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "SSIC_CODES", "DESCRIPTION"]
mapped_ssic_code = mapped_ssic_code[columns_to_keep].copy()

# --- CLEAN SSIC_CODES COLUMN ---
# Unparseable / missing codes become the sentinel 0 so the column can be a plain int.
mapped_ssic_code["SSIC_CODES"] = (
    pd.to_numeric(mapped_ssic_code["SSIC_CODES"], errors="coerce")  # safely convert to numeric
    .fillna(0)
    .astype(int)
)

# --- CLEAN TEXT COLUMNS ---
# astype(str) turns a missing cell into the text "nan", which .title() would
# then render as the fake label "Nan". .where(col.notna()) keeps the cleaned
# text only where the original cell had a value and leaves the rest missing.
text_cols = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "DESCRIPTION"]
mapped_ssic_code[text_cols] = mapped_ssic_code[text_cols].apply(
    lambda col: col.astype(str).str.strip().str.title().where(col.notna())
)

# --- REMOVE DUPLICATES & RESET INDEX ---
mapped_ssic_code = mapped_ssic_code.drop_duplicates().reset_index(drop=True)

mapped_ssic_code.head()


Unnamed: 0,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,SSIC_CODES,DESCRIPTION
0,Retail,Retail,Fashion & Apparel,47711,Retail Sale Of Clothing For Adults
1,Retail,Retail,Fashion & Apparel,47712,Retail Sale Of Children And Infants' Clothing
2,Retail,Retail,Fashion & Apparel,47715,Retail Sale Of Sewing And Clothing Accessories
3,Retail,Retail,Fashion & Apparel,47719,"Retail Sale Of Clothing, Footwear And Leather ..."
4,Retail,Retail,Fashion & Apparel,47510,Retail Sale Of Textiles


### Merge ACRA data with SSIC code

In [24]:
# Convert PRIMARY_SSIC_CODE to int (unparseable / missing codes become the sentinel 0)
acra_data["PRIMARY_SSIC_CODE"] = (
    pd.to_numeric(acra_data["PRIMARY_SSIC_CODE"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# The mapping table uses the same 0 sentinel for codes it could not parse.
# Leave those rows out of the join: otherwise an ACRA row with no usable code
# would "match" a mapping row with no usable code and pick up an unrelated
# industry (and be duplicated if there are several such mapping rows).
valid_ssic_map = mapped_ssic_code[mapped_ssic_code["SSIC_CODES"] != 0]

# Merge based on SSIC code; how="left" keeps every ACRA row, with empty
# industry columns where the code has no mapping.
acra_data_filtered = acra_data.merge(
    valid_ssic_map,
    how="left",
    left_on="PRIMARY_SSIC_CODE",
    right_on="SSIC_CODES"
)

# Optional: drop the duplicate 'SSIC_CODES' column (keep only PRIMARY_SSIC_CODE)
acra_data_filtered = acra_data_filtered.drop(columns=["SSIC_CODES"], errors="ignore")


In [25]:
# Display the ACRA frame after the left join with the SSIC industry mapping.
acra_data_filtered

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Livestock, Meat, Poultry, Eggs An..."
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,ANG MO KIO INDUSTRIAL PARK 2A,568049,Others,Manufacturing,Other Specialised Manufacturing & Distribution,Other Manufacturing Industries N.E.C.
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,WOODLANDS STREET 12,738623,Retail,Retail,Fashion & Apparel,Retail Sale Of Clothing For Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,KELANTAN LANE,200031,Services,Services,Hair Salons & Barbershops,Hairdressing Salons/Shops
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,YISHUN INDUSTRIAL STREET 1,768161,Others,Built Environment & Infrastructure,Construction,Renovation Contractors
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,SIN MING LANE,573969,Others,"Finance, Legal & Real Estate","Legal, Accounting & Consultancy Activities",Management Consultancy Services N.E.C.
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,JALAN BAHAGIA,320034,Others,"Tourism, Agency",Travel Agencies & Tour Operators,Travel Agencies And Tour Operators (Mainly Out...


### Filter ACRA data against the Master DB to get the list of companies that haven't been researched by MR

In [8]:

# Bring both UEN columns to the same textual form (string, trimmed, upper-case)
# so the membership test below compares like with like.
acra_uen_normalised = acra_data_filtered['UEN'].astype(str).str.strip().str.upper()
master_uen_normalised = master_db_df['UEN'].astype(str).str.strip().str.upper()
acra_data_filtered['UEN'] = acra_uen_normalised
master_db_df['UEN'] = master_uen_normalised

# Keep only the ACRA rows whose UEN does not already appear in the Master DB.
already_in_master = acra_data_filtered['UEN'].isin(master_db_df['UEN'])
acra_data_filtered = acra_data_filtered[~already_in_master]

acra_data_filtered.shape

(533824, 13)

### Filter by Industry (Wholesale)

In [9]:
# Wholesale SSIC codes (division 46), laid out one 4-digit class per row.
# Stored as strings because the filter below compares against the code cast to str.
ssic_codes = [
    "46",
    "461", "4610", "46100",
    "462", "4621", "46211", "46212", "46213", "46219",
    "4622", "46221", "46222", "46223", "46224", "46225", "46229",
    "463", "4630", "46301", "46302", "46303", "46304", "46305", "46306", "46307", "46308", "46309",
    "464", "4641", "46411", "46412", "46413", "46414", "46415", "46416",
    "4642", "46421", "46422", "46423", "46424", "46429",
    "4643", "46431", "46432", "46433", "46434", "46435", "46436", "46439",
    "4644", "46441", "46442", "46443", "46444", "46445", "46449",
    "4645", "46451", "46452", "46453", "46459",
    "4646", "46461", "46462",
    "4647", "46471", "46472", "46473", "46474", "46479",
    "4649", "46491", "46492", "46499",
    "465", "4651", "46511", "46512", "46513", "46514",
    "4652", "46521", "46522", "46523",
    "4653", "46530",
    "4654", "46541", "46542", "46543", "46544", "46549",
    "4655", "46551", "46552", "46559",
    "4656", "46561", "46562", "46563",
    "4659", "46591", "46592", "46593", "46594", "46595", "46599",
    "466", "4661", "46610",
    "4662", "46620",
    "4663", "46631", "46632", "46633", "46634", "46635", "46639",
    "4664", "46641", "46642", "46643", "46649",
    "4665", "46651", "46659",
    "4666", "46661", "46662",
    "469", "4690", "46900",
]

# Row masks: entity status is "live" / "live company" (case-insensitive) ...
status_lower = acra_data_filtered["ENTITY_STATUS_DESCRIPTION"].str.lower()
is_live = (status_lower == "live") | (status_lower == "live company")
# ... and the primary SSIC code is one of the wholesale codes above.
is_wholesale = acra_data_filtered["PRIMARY_SSIC_CODE"].astype(str).isin(ssic_codes)

acra_data_filtered_wholesale = acra_data_filtered[is_live & is_wholesale]

acra_data_filtered_wholesale

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,FISHERY PORT ROAD,619742,Others,Wholesale Trade,"Food, Beverages & Tobacco","Wholesale Of Livestock, Meat, Poultry, Eggs An..."
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,SIMS AVENUE,387509,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,JELLICOE ROAD,208767,Others,Wholesale Trade,Household Goods,Wholesale Of Textiles And Leathers
12,04129500E,AIK HOE & CO,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,23-01-1975,46551,KELANTAN ROAD,200028,Others,Wholesale Trade,"Machinery, Equipment & Supplies",Wholesale Of Marine Equipment And Accessories
14,04545400X,AIK HUAT AND COMPANY,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,17-01-1975,46441,KAKI BUKIT AVENUE 1,417943,Others,Wholesale Trade,Household Goods,Wholesale Of Sporting Goods And Equipment
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537268,T17LP0162L,ZYA HOLDINGS LIMITED PARTNERSHIP,,LIMITED PARTNERSHIP,LIVE,21-10-2017,46100,NATHAN ROAD,248728,Others,Wholesale Trade,Other Specialised Wholesale,Wholesale On A Fee Or Commission Basis
537298,T22LL0564C,ZEN ENGINEERING & TRADING LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,31-05-2022,46543,TOH GUAN ROAD EAST,608586,Others,Wholesale Trade,"Machinery, Equipment & Supplies","Wholesale Of Lifts, Escalators And Industrial ..."
537302,T23LL0056G,ZECRYNE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,13-01-2023,46301,BUKIT BATOK STREET 25,658881,Others,Wholesale Trade,"Food, Beverages & Tobacco",Wholesale Of Fruits And Vegetables
537313,T24LL0528K,ZOHMH LIMITED LIABILITY PARTNERSHIP,,LIMITED LIABILITY PARTNERSHIP,LIVE,07-05-2024,46303,WOODLANDS AVENUE 4,730844,Others,Wholesale Trade,"Food, Beverages & Tobacco",Wholesale Of A General Line Of Groceries


In [None]:
# recordowl_results = pd.read_excel("Fresh_Leads.xlsx")
# is_unique = recordowl_results['UEN'].is_unique
# print("Is UEN unique?:", is_unique)

# recordowl_results.shape


Is UEN unique?: True


(255, 10)

### Get RecordOwl Data

In [None]:
import pandas as pd

# Maximum number of companies to draw for this RecordOwl lookup batch.
SAMPLE_SIZE = 200

# --- Copy to avoid SettingWithCopyWarning ---
acra_data_filtered_wholesale = acra_data_filtered_wholesale.copy()

# --- Ensure the column is in datetime format ---
# The ACRA cell stored this column as 'dd-mm-YYYY' text, so the format must be
# given explicitly. Without it pandas guesses month-first: a value such as
# "07-02-1975" is read as 2 July, and depending on the pandas version
# day-first values such as "28-10-1974" are then coerced to NaT and silently
# dropped by the date filter below.
acra_data_filtered_wholesale["REGISTRATION_INCORPORATION_DATE"] = pd.to_datetime(
    acra_data_filtered_wholesale["REGISTRATION_INCORPORATION_DATE"],
    format="%d-%m-%Y",
    errors="coerce",
)

# --- Filter for companies registered after 2015-01-01 ---
filtered = acra_data_filtered_wholesale[
    acra_data_filtered_wholesale["REGISTRATION_INCORPORATION_DATE"] > "2015-01-01"
].copy()

# --- Remove rows whose UEN already exists in Fresh_Leads.xlsx ---
recordowl_results = pd.read_excel("Fresh_Leads.xlsx")

# Standardize column names so the UEN column is found regardless of its case
recordowl_results.columns = recordowl_results.columns.str.upper()

# Ensure both dataframes have a 'UEN' column
if "UEN" in recordowl_results.columns and "UEN" in filtered.columns:
    # Normalise the spreadsheet UENs the same way the ACRA UENs were normalised
    # (string, trimmed, upper-case) so stray whitespace or lower-case letters
    # cannot make an already-processed company look new.
    processed_uens = recordowl_results["UEN"].astype(str).str.strip().str.upper()
    filtered = filtered[~filtered["UEN"].isin(processed_uens)]
else:
    raise ValueError("Column 'UEN' not found in one of the dataframes.")

# --- Sample up to SAMPLE_SIZE rows from the filtered data ---
# min() keeps .sample() from raising when fewer than SAMPLE_SIZE rows remain.
acra_data_filtered_wholesale_10 = filtered.sample(
    n=min(SAMPLE_SIZE, len(filtered)), random_state=42
).reset_index(drop=True)

acra_data_filtered_wholesale_10.head()


In [None]:
# (rows, columns) of the sampled batch that will be sent to RecordOwl.
acra_data_filtered_wholesale_10.shape

In [None]:
# Sanity check: does any UEN already saved in Fresh_Leads.xlsx also appear in
# the freshly sampled batch? Prints "Yes" if at least one does, otherwise "No".
Fresh_Leads = pd.read_excel("Fresh_Leads.xlsx")

sampled_uens = acra_data_filtered_wholesale_10["UEN"]
result = Fresh_Leads["UEN"].isin(sampled_uens).any()
if result:
    print("Yes")
else:
    print("No")

In [None]:
# import pandas as pd

# acra_data_filtered_wholesale_10 = pd.DataFrame({
#     "UEN": ["201903934W"]
# })


In [None]:
import time
import pandas as pd
from apify_client import ApifyClient
from bs4 import BeautifulSoup
import re
import json
from requests.exceptions import HTTPError, ConnectionError
from urllib3.exceptions import ProtocolError

import os

# The Apify API token is read from the environment rather than being written
# into the notebook, where it would be exposed to anyone who can read the file
# or its version history.
# SECURITY: a token was previously hard-coded on this line; treat it as leaked
# and revoke/rotate it in the Apify console.
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")
if not APIFY_API_TOKEN:
    raise RuntimeError(
        "APIFY_API_TOKEN is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this cell."
    )

client = ApifyClient(APIFY_API_TOKEN)

# Domains treated as social-media sites.
SOCIAL_MEDIA_DOMAINS = [
    "facebook.com", "linkedin.com", "instagram.com", "youtube.com",
    "tiktok.com", "twitter.com", "x.com", "pinterest.com"
]

def fetch_dataset_items_safe(dataset_client, max_retries=5, initial_wait=3):
    """Fetch all items of a dataset, with a fallback strategy and retries.

    Every attempt tries two strategies in order:

    1. stream the items with ``dataset_client.iterate_items()``;
    2. if that raises or yields nothing, page through
       ``dataset_client.list_items(offset=..., limit=100, clean=True)`` until a
       page is empty or shorter than the limit.

    The first strategy that produces at least one item wins. If neither does,
    the function sleeps ``initial_wait * 2 ** attempt`` seconds (exponential
    backoff) and starts the next attempt. Errors are reported with ``print``
    and never propagated.

    Args:
        dataset_client: Object exposing ``iterate_items()`` and
            ``list_items(offset, limit, clean)``; each page returned by
            ``list_items`` must have an ``items`` list.
        max_retries: Maximum number of attempts.
        initial_wait: Backoff base in seconds; the wait doubles per attempt.

    Returns:
        list: The fetched items, or ``[]`` when every attempt failed or the
        dataset stayed empty.
    """
    for attempt in range(max_retries):
        attempt_label = f"attempt {attempt + 1}/{max_retries}"
        last_error = None

        # Strategy 1: streaming iteration.
        try:
            streamed = list(dataset_client.iterate_items())
            if streamed:
                return streamed
        except Exception as e:
            last_error = e
            print(f"  ‚ö†Ô∏è Iteration method failed ({attempt_label}), trying direct fetch...")

        # Strategy 2: direct pagination. The list is rebuilt on every attempt
        # so pages collected before a mid-pagination failure are never
        # duplicated by a later attempt.
        try:
            paged = []
            offset = 0
            limit = 100
            while True:
                page = dataset_client.list_items(offset=offset, limit=limit, clean=True)
                if not page.items:
                    break
                paged.extend(page.items)
                if len(page.items) < limit:
                    # A short page is the last page.
                    break
                offset += limit
            if paged:
                return paged
        except Exception as e:
            last_error = e
            print(f"  ‚ö†Ô∏è Direct fetch failed ({attempt_label}): {e}")

        if attempt < max_retries - 1:
            # Back off after errors and after empty results alike, so a dataset
            # that is not populated yet is not polled in a tight loop.
            wait_time = initial_wait * (2 ** attempt)
            print(f"  ‚ö†Ô∏è No items retrieved ({attempt_label}), retrying in {wait_time}s...")
            time.sleep(wait_time)
        elif last_error is not None:
            print(f"  ‚ùå All fetch methods failed: {last_error}")

    return []

all_results = []

for idx, (i, row) in enumerate(acra_data_filtered_wholesale_10.iterrows(), 1):
    uen = str(row["UEN"]).strip()
    print(f"\nüîé Processing {uen} ({idx}/{len(acra_data_filtered_wholesale_10)})")

    # Build pageFunction with proper escaping
    page_function = f"""
    async function pageFunction(context) {{
        const {{ page, log, request }} = context;
        const uen = "{uen}";
        log.info("Visiting RecordOwl for UEN: " + uen);

        try {{
            await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ timeout: 30000 }});
            const input = await page.$("input[placeholder='Search company name, industry, or address']");
            await input.click({{ clickCount: 3 }});
            await input.type(uen, {{ delay: 100 }});

            await Promise.all([
                page.waitForNavigation({{ waitUntil: 'networkidle2', timeout: 60000 }}).catch(() => null),
                page.click("button[type='submit']")
            ]);

            // Wait for results with longer timeout
            try {{
                await page.waitForSelector("a[href*='/company/']", {{ timeout: 45000 }});
            }} catch (e) {{
                log.info("No company links found, might be not found");
                return {{ status: 'not_found', uen }};
            }}

            const companyLink = await page.$$eval("a[href*='/company/']", (links, uen) => {{
                for (const a of links) {{
                    const text = a.innerText || "";
                    const href = a.href || "";
                    if (text.includes(uen) || href.includes(uen.toLowerCase())) return a.href;
                }}
                return links.length > 0 ? links[0].href : null;
            }}, uen);

            if (!companyLink) return {{ status: 'not_found', uen }};

            if (page.url() !== companyLink) {{
                await page.goto(companyLink, {{ waitUntil: 'networkidle2', timeout: 60000 }});
            }}

            await new Promise(r => setTimeout(r, 3000)); // Increased wait time
            const html_content = await page.content();
            const title = await page.title();
            const url = page.url();

            return {{ status: 'success', uen, url, title, html_content }};
        }} catch (err) {{
            log.error("Error in pageFunction: " + err.message);
            return {{ status: 'error', uen, error: err.message }};
        }}
    }}
    """

    run_input = {
        "startUrls": [{"url": "https://recordowl.com/"}],
        "useChrome": True,
        "headless": True,
        "stealth": True,
        "pageFunction": page_function,
    }

    run = None
    try:
        # Start the run (same as original working code)
        print(f"  üì° Starting Apify run for {uen}...")
        run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)
        
        # Wait for the run to finish (poll the status)
        print(f"  ‚è≥ Waiting for run to complete...")
        run_client = client.run(run["id"])
        run_client.wait_for_finish()
    except Exception as e:
        print(f"  ‚ùå Apify call failed for {uen}: {e}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": f"Apify call failed: {str(e)}"
        })
        time.sleep(5)
        continue

    if not run or "defaultDatasetId" not in run:
        print(f"  ‚ö†Ô∏è No valid dataset returned for {uen}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": None,
            "Error": "No dataset returned"
        })
        continue

    # Wait for dataset to be ready
    print(f"  ‚è≥ Waiting for dataset to be ready...")
    time.sleep(3)
    
    scraped_html, record_owl_url = None, None
    
    # Fetch dataset items with improved error handling
    dataset_items = fetch_dataset_items_safe(
        client.dataset(run["defaultDatasetId"]),
        max_retries=5,
        initial_wait=3
    )
    
    # Process items
    for item in dataset_items:
        if item.get("status") == "success":
            scraped_html = item.get("html_content", "")
            record_owl_url = item.get("url")
            print(f"  ‚úÖ Successfully scraped {uen}")
        elif item.get("status") == "not_found":
            print(f"  ‚ö†Ô∏è Company not found for UEN {uen}")
        elif item.get("status") == "error":
            print(f"  ‚ùå Error for {uen}: {item.get('error')}")

    if not scraped_html:
        print(f"  ‚ö†Ô∏è No HTML content retrieved for {uen}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": "No HTML content retrieved"
        })
        time.sleep(5)
        continue

    # Parse HTML
    try:
        soup = BeautifulSoup(scraped_html, "html.parser")
        parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")

        emails, phones, website = [], [], None
        facebook_links, linkedin_links, instagram_links, tiktok_links = [], [], [], []

        if parent:
            # Extract emails
            for a in parent.select("a[href^=mailto]"):
                email = a.get("href", "").replace("mailto:", "").strip()
                if email and email not in emails and "@" in email:
                    emails.append(email)

            # ========== COMPREHENSIVE PHONE EXTRACTION ==========
            # This extracts Singapore phone numbers with ANY spacing/formatting:
            # - "65 63 19 2960" (spaces between digits)
            # - "6563192960" (no spaces)
            # - "+65-6319-2960" (dashes)
            # - "65 6 3 1 9 2 9 6 0" (space between every digit)
            # - "(65) 6319 2960" (with parentheses)
            # Method: Extract ALL digits first, then validate pattern
            print(f"  üîç Searching for phone numbers...")
            
            # Method 1: Look for tel: links (most reliable)
            tel_links = parent.select("a[href^='tel:'], a[href^='tel']")
            if tel_links:
                print(f"  üì± Found {len(tel_links)} tel: links")
            
            for a in tel_links:
                tel_href = a.get("href", "").replace("tel:", "").strip()
                tel_text = a.get_text(strip=True)
                print(f"  üìû Tel link - href: '{tel_href}', text: '{tel_text}'")
                
                # Extract all digits from tel link
                digits_only = re.sub(r"\D", "", tel_href)
                print(f"  üî¢ Tel digits: {digits_only}")
                
                # Handle different digit lengths
                if len(digits_only) == 10 and digits_only.startswith("65") and digits_only[2] in "689":
                    # 10 digits starting with 65 (e.g., "6563192960")
                    formatted = "+" + digits_only
                    if formatted not in phones:
                        phones.append(formatted)
                        print(f"  ‚úÖ Added from tel link (10 digits): {formatted}")
                elif len(digits_only) == 8 and digits_only[0] in "689":
                    # 8 digits starting with 6/8/9 (e.g., "63192960")
                    formatted = "+65" + digits_only
                    if formatted not in phones:
                        phones.append(formatted)
                        print(f"  ‚úÖ Added from tel link (8 digits): {formatted}")
                elif len(digits_only) > 10:
                    # More than 10 digits, try to find valid pattern
                    print(f"  üîç Searching within {len(digits_only)} digits for valid pattern...")
                    found = False
                    # Look for 65 followed by 6/8/9
                    for i in range(len(digits_only) - 9):
                        if digits_only[i:i+2] == "65" and digits_only[i+2] in "689":
                            formatted = "+" + digits_only[i:i+10]
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from tel link (extracted): {formatted}")
                            found = True
                            break
                    if not found:
                        # Try last 8 digits if they start with 6/8/9
                        if digits_only[-8] in "689":
                            formatted = "+65" + digits_only[-8:]
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from tel link (last 8 digits): {formatted}")
            
            # Method 2: Look in dt/dd structure with broader keywords
            dt_tags = parent.select("dt")
            if dt_tags:
                print(f"  üìã Found {len(dt_tags)} dt tags")
            
            for dt in dt_tags:
                dt_text = dt.get_text(strip=True).lower()
                # Check for phone-related keywords but exclude non-phone fields
                exclude_keywords = ["officer", "charge", "employee", "shareholder", "director", "registration"]
                phone_keywords = ["contact number", "phone", "tel", "mobile", "call", "contact no"]
                
                is_phone_field = any(kw in dt_text for kw in phone_keywords)
                is_excluded = any(excl in dt_text for excl in exclude_keywords)
                
                if is_phone_field and not is_excluded:
                    dd = dt.find_next_sibling("dd")
                    if dd:
                        number_text = dd.get_text(" ", strip=True)
                        print(f"  üìù Field '{dt_text}': {number_text}")
                        
                        # Extract all digits and check if it forms a valid phone number
                        all_digits = re.sub(r"\D", "", number_text)
                        print(f"  üî¢ Extracted digits: {all_digits}")
                        
                        # Check for Singapore phone patterns in the digits
                        # Pattern 1: 10 digits starting with 65
                        if len(all_digits) == 10 and all_digits.startswith("65") and all_digits[2] in "689":
                            formatted = "+" + all_digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from dt/dd (10 digits): {formatted}")
                        # Pattern 2: 8 digits starting with 6, 8, or 9
                        elif len(all_digits) == 8 and all_digits[0] in "689":
                            formatted = "+65" + all_digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from dt/dd (8 digits): {formatted}")
                        # Pattern 3: More than 10 digits, try to extract 10-digit number starting with 65
                        elif len(all_digits) > 10:
                            # Look for 65 followed by 6/8/9 in the digit string
                            for i in range(len(all_digits) - 9):
                                if all_digits[i:i+2] == "65" and all_digits[i+2] in "689":
                                    potential_number = all_digits[i:i+10]
                                    formatted = "+" + potential_number
                                    if formatted not in phones:
                                        phones.append(formatted)
                                        print(f"  ‚úÖ Added from dt/dd (extracted): {formatted}")
                                    break
            
            # Method 3: Search entire parent for phone patterns if none found
            if not phones:
                print(f"  üîé No phones found yet, searching entire content...")
                full_text = parent.get_text()
                
                # Ultra-comprehensive patterns to catch ALL spacing variations
                # These patterns allow unlimited spaces/dashes between digits
                patterns = [
                    # Pattern 1: +65 with any spacing (e.g., "+65 6 3 1 9 2 9 6 0", "+65-6319-2960")
                    r"\+[\s\-]*65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 2: (65) with any spacing
                    r"\([\s\-]*65[\s\-]*\)[\s\-]*[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d",
                    # Pattern 3: 65 without + or () but with space/dash (e.g., "65 6 3 1 9 2 9 6 0", "65-6319-2960")
                    r"(?<!\d)65[\s\-]+[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                    # Pattern 4: Just 8 digits starting with 6/8/9 with any spacing
                    r"(?<!\d)[689][\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d[\s\-]*\d(?!\d)",
                ]
                
                for pattern_idx, pattern in enumerate(patterns, 1):
                    matches = re.findall(pattern, full_text)
                    if matches:
                        print(f"  üîç Pattern {pattern_idx} found {len(matches)} potential matches")
                    
                    for match in matches:
                        # Extract only digits
                        digits = re.sub(r"\D", "", match)
                        print(f"  üî¢ Pattern {pattern_idx} match: '{match.strip()}' ‚Üí digits: '{digits}'")
                        
                        # Validate and format
                        if len(digits) == 10 and digits.startswith("65") and digits[2] in "689":
                            formatted = "+" + digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from pattern {pattern_idx} (10 digits): {formatted}")
                        elif len(digits) == 8 and digits[0] in "689":
                            formatted = "+65" + digits
                            if formatted not in phones:
                                phones.append(formatted)
                                print(f"  ‚úÖ Added from pattern {pattern_idx} (8 digits): {formatted}")
                        elif len(digits) > 10:
                            # Try to find a valid 10-digit number within
                            for i in range(len(digits) - 9):
                                if digits[i:i+2] == "65" and digits[i+2] in "689":
                                    potential = digits[i:i+10]
                                    formatted = "+" + potential
                                    if formatted not in phones:
                                        phones.append(formatted)
                                        print(f"  ‚úÖ Added from pattern {pattern_idx} (extracted): {formatted}")
                                    break
            
            if phones:
                print(f"  ‚úÖ Total phones found: {phones}")
            else:
                print(f"  ‚ö†Ô∏è WARNING: No phone numbers found for {uen}")
                print(f"  üìÑ Showing first 500 chars of parent HTML for debugging:")
                print(parent.prettify()[:500] + "...")
            # ========== END PHONE EXTRACTION ==========

            # Extract website
            valid_websites = []
            for a in parent.select("a[href^=http]"):
                href = a.get("href", "").strip()
                href_lower = href.lower()
                if not any(domain in href_lower for domain in SOCIAL_MEDIA_DOMAINS):
                    if not any(skip in href_lower for skip in ["recordowl", "apify.com"]):
                        if any(tld in href for tld in [".com", ".sg", ".net", ".org", ".co"]):
                            valid_websites.append(href)
            website = valid_websites[0] if valid_websites else None

        # Extract social media links from entire page
        for a in soup.find_all("a", href=True):
            href = a["href"].strip().lower()
            if "facebook.com" in href and href not in facebook_links:
                facebook_links.append(href)
            elif "linkedin.com" in href and href not in linkedin_links:
                linkedin_links.append(href)
            elif "instagram.com" in href and href not in instagram_links:
                instagram_links.append(href)
            elif "tiktok.com" in href and href not in tiktok_links:
                tiktok_links.append(href)

        all_results.append({
            "UEN": uen,
            "Emails": emails if emails else None,
            "Phones": phones if phones else None,
            "Website": website,
            "Facebook": list(set(facebook_links)) if facebook_links else None,
            "LinkedIn": list(set(linkedin_links)) if linkedin_links else None,
            "Instagram": list(set(instagram_links)) if instagram_links else None,
            "TikTok": list(set(tiktok_links)) if tiktok_links else None,
            "RecordOwl_Link": record_owl_url,
        })
        print(f"  ‚úÖ Processed {uen}: {len(emails) if emails else 0} emails, {len(phones) if phones else 0} phones")
        
    except Exception as e:
        print(f"  ‚ùå Error parsing HTML for {uen}: {e}")
        all_results.append({
            "UEN": uen,
            "Emails": None,
            "Phones": None,
            "Website": None,
            "Facebook": None,
            "LinkedIn": None,
            "Instagram": None,
            "TikTok": None,
            "RecordOwl_Link": record_owl_url or None,
            "Error": f"HTML parsing error: {str(e)}"
        })

    # Dynamic sleep time to avoid rate limiting
    sleep_time = 10 + (idx % 5)  # 10-14 seconds
    print(f"  üí§ Sleeping for {sleep_time}s before next request...")
    time.sleep(sleep_time)

# Assemble the per-UEN result dicts collected by the scraping loop into one table.
Fresh_Leads = pd.DataFrame(all_results)

# If the loop processed nothing, `all_results` is empty and the frame has no
# columns, so the summary lookups below would raise KeyError. Re-indexing on the
# keys that every result dict carries guarantees those columns exist; any extra
# columns (e.g. "Error" from failed rows) are kept after them in their original order.
_result_columns = [
    "UEN", "Emails", "Phones", "Website",
    "Facebook", "LinkedIn", "Instagram", "TikTok", "RecordOwl_Link",
]
Fresh_Leads = Fresh_Leads.reindex(
    columns=_result_columns + [col for col in Fresh_Leads.columns if col not in _result_columns]
)

# Summary: counts of rows whose cell is non-null (fields are stored as None when nothing was found).
print("\n‚úÖ Scraping complete!")
print("\nüìä Results summary:")
print(f"   Total processed: {len(Fresh_Leads)}")
print(f"   With emails: {Fresh_Leads['Emails'].notna().sum()}")
print(f"   With phones: {Fresh_Leads['Phones'].notna().sum()}")
print(f"   With websites: {Fresh_Leads['Website'].notna().sum()}")

# Preview the first 10 rows as the cell output.
Fresh_Leads.head(10)

In [None]:
# Display the scraped results table as the cell output.
Fresh_Leads

In [None]:
# NOTE(review): same bare `Fresh_Leads` display as the cell directly above — looks like a leftover; confirm before removing.
Fresh_Leads

In [None]:
# Keep only the rows whose "Phones" cell is populated (non-null).
Fresh_Leads_with_phones = Fresh_Leads.loc[pd.notna(Fresh_Leads["Phones"])]

In [None]:
# Display the subset of leads that have a non-null "Phones" value.
Fresh_Leads_with_phones

In [None]:

# Read the previously exported leads workbook.
# NOTE(review): this rebinds `Fresh_Leads`, replacing the in-memory scrape results
# with the workbook's contents — any earlier cell re-run after this one will
# operate on the loaded data instead.
file_path_1 = "Fresh_Leads.xlsx"
Fresh_Leads = pd.read_excel(io=file_path_1)

# Stack the phone-bearing scraped rows underneath the workbook rows and renumber the index.
combined_df = pd.concat(
    objs=[Fresh_Leads, Fresh_Leads_with_phones],
    axis=0,
    ignore_index=True,
)

# Write the combined table to a new workbook, omitting the row index.
combined_df.to_excel(excel_writer="Fresh_Leads_New.xlsx", index=False)

# Show the combined table as the cell output.
combined_df


In [None]:
# Count the rows of the combined table whose "Phones" cell is non-null.
count_non_nan = pd.notna(combined_df["Phones"]).sum()
print(count_non_nan)


### Web Scraping

In [None]:
# from apify_client import ApifyClient

# client = ApifyClient(os.environ["APIFY_TOKEN"])  # read the token from the environment; never commit credentials

# uen = "202333361N"
# scraped_html = None  # Will hold the HTML content

# run_input = {
#     "startUrls": [{"url": "https://recordowl.com/"}],
#     "useChrome": True,
#     "headless": True,
#     "stealth": True,
#     "pageFunction": f"""
#     async function pageFunction(context) {{
#         const {{ page, log, request }} = context;
#         const uen = "{uen}";
#         log.info("üåê Visiting " + request.url);

#         try {{
#             // Wait for search input and type UEN
#             await page.waitForSelector("input[placeholder='Search company name, industry, or address']", {{ timeout: 30000 }});
#             const input = await page.$("input[placeholder='Search company name, industry, or address']");
#             await input.click({{ clickCount: 3 }});
#             await input.type(uen, {{ delay: 80 }});
#             log.info("‚úÖ Typed UEN: " + uen);

#             // Click search button and wait for navigation or results
#             const [response] = await Promise.all([
#                 page.waitForNavigation({{ waitUntil: "networkidle2", timeout: 60000 }}).catch(() => null), // in case no full navigation
#                 page.click("button[type='submit']"),
#             ]);
#             log.info("üîç Clicked search button");

#             // Wait for company link(s) to appear
#             await page.waitForSelector("a[href*='/company/']", {{ timeout: 40000 }});

#             // Get the first matching company link
#             const companyLink = await page.$$eval("a[href*='/company/']", (links, uen) => {{
#                 for (const a of links) {{
#                     if (a.innerText.includes(uen) || a.href.includes(uen)) return a.href;
#                 }}
#                 return links.length > 0 ? links[0].href : null;
#             }}, uen);

#             if (!companyLink) return {{ status: "not_found", uen }};

#             // Navigate to the company page (if different)
#             if (page.url() !== companyLink) {{
#                 await page.goto(companyLink, {{ waitUntil: "networkidle2", timeout: 60000 }});
#             }}

#             // Small wait to ensure content is fully loaded
#             await new Promise(resolve => setTimeout(resolve, 2000));

#             // Extract HTML
#             const html_content = await page.content();
#             const title = await page.title();
#             const url = page.url();

#             return {{ status: "success", uen, url, title, html_content }};
#         }} catch (err) {{
#             log.error("‚ùå Error: " + err.message);
#             return {{ status: "error", uen, error: err.message }};
#         }}
#     }}
#     """
# }

# # Run the scraper
# run = client.actor("apify/puppeteer-scraper").call(run_input=run_input)

# # Store HTML into variable
# for item in client.dataset(run["defaultDatasetId"]).iterate_items():
#     if item.get("status") == "success":
#         scraped_html = item.get("html_content", "")
#         record_owl_url = item.get("url")  
        
#         print("‚úÖ Scraped successfully")
#         print("Title:", item.get("title"))
#         print("RecordOwl URL:", record_owl_url)
#     elif item.get("status") == "not_found":
#         print(f"‚ö†Ô∏è Company not found for UEN {uen}")
#     else:
#         print("‚ùå Error:", item.get("error"))

In [None]:
# import pandas as pd
# from bs4 import BeautifulSoup
# import re

# # scraped_html and record_owl_url come from the Apify scraping code above
# # record_owl_url = item.get("url") from the Apify result

# # --- PARSE HTML ---
# soup = BeautifulSoup(scraped_html, "html.parser")

# # --- TARGET MAIN CONTAINER ---
# parent = soup.select_one("div.max-w-7xl.mx-auto.lg\\:py-6.sm\\:px-6.lg\\:px-8")

# emails, phones, website = [], [], None
# facebook_links, linkedin_links, instagram_links, tiktok_links = [], [], [], []
# number_of_employees = None

# # -------------------------------------------------------------
# # Extract Contact Info (Emails, Phones, Website)
# # -------------------------------------------------------------
# if parent:
#     for a in parent.select("a[href^=mailto]"):
#         email = a.get("href").replace("mailto:", "").strip()
#         if email not in emails:
#             emails.append(email)

#     for dt in parent.select("dt"):
#         label = dt.get_text(strip=True).lower()
#         if "contact number" in label:
#             dd = dt.find_next_sibling("dd")
#             if dd:
#                 number_text = dd.get_text(" ", strip=True)
#                 phone_pattern = r"(?:\+65\s*|65)?(?:\d\s*){8,}"
#                 for match in re.findall(phone_pattern, number_text):
#                     p_clean = re.sub(r"\D", "", match)
#                     if p_clean.startswith("65") and not p_clean.startswith("+65"):
#                         p_clean = "+" + p_clean
#                     elif not p_clean.startswith("+65"):
#                         p_clean = "+65" + p_clean
#                     if p_clean not in phones:
#                         phones.append(p_clean)

#     for a in parent.select("a[href^=http]"):
#         href = a.get("href").strip()
#         if any(skip in href.lower() for skip in ["recordowl", "apify.com"]):
#             continue
#         if (".com" in href or ".sg" in href) and not href.startswith(("mailto:", "tel:")):
#             website = href
#             break

# # -------------------------------------------------------------
# # Extract Social Media Links
# # -------------------------------------------------------------
# for a in soup.find_all("a", href=True):
#     href = a["href"].strip().lower()
#     if "facebook.com" in href:
#         facebook_links.append(href)
#     elif "linkedin.com" in href:
#         linkedin_links.append(href)
#     elif "instagram.com" in href:
#         instagram_links.append(href)
#     elif "tiktok.com" in href:
#         tiktok_links.append(href)

# facebook_links = list(set(facebook_links))
# linkedin_links = list(set(linkedin_links))
# instagram_links = list(set(instagram_links))
# tiktok_links = list(set(tiktok_links))

# # -------------------------------------------------------------
# # Extract Number of Employees
# # -------------------------------------------------------------
# for li in soup.select("li"):
#     li_text = li.get_text(" ", strip=True).lower()
#     if "number of employees" in li_text:
#         p_tags = li.find_all("p")
#         for i, p in enumerate(p_tags):
#             if "new value" in p.get_text(strip=True).lower() and i + 1 < len(p_tags):
#                 number_of_employees = p_tags[i + 1].get_text(strip=True)
#                 break
#         if number_of_employees:
#             break

# # -------------------------------------------------------------
# # SAVE RESULTS TO DATAFRAME (use None for empty values)
# # -------------------------------------------------------------
# result_df = pd.DataFrame([{
#     "Emails": emails if emails else None,
#     "Phones": phones if phones else None,
#     "Website": website if website else None,
#     "Facebook": facebook_links if facebook_links else None,
#     "LinkedIn": linkedin_links if linkedin_links else None,
#     "Instagram": instagram_links if instagram_links else None,
#     "TikTok": tiktok_links if tiktok_links else None,
#     "Number_of_Employees": number_of_employees if number_of_employees else None,
#     "RecordOwl_Link": record_owl_url if record_owl_url else None,
# }])

# display(result_df)

In [None]:
# import httpx
# import asyncio

# # =====================================================
# # Validate Website (only if no phone number)
# # =====================================================
# async def check_url(url: str) -> bool:
#     """Return True if the URL is reachable (status < 400)."""
#     if not url:
#         return False
#     try:
#         async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
#             response = await client.head(url)
#             return response.status_code < 400
#     except Exception:
#         return False


# async def validate_if_needed(df):
#     """Validate websites only if phone number is missing."""
#     for i, row in df.iterrows():
#         url = row.get("Website")
#         phone = row.get("Phones")

#         # Skip validation if phone exists
#         if phone:
#             df.at[i, "Website_Valid"] = None
#             continue

#         # Validate website if no phone
#         if url:
#             is_valid = await check_url(url)
#             df.at[i, "Website_Valid"] = "valid" if is_valid else "invalid"
#         else:
#             df.at[i, "Website_Valid"] = "invalid"

#     return df


# # =====================================================
# # Run async validation safely inside Jupyter
# # =====================================================
# result_df = await validate_if_needed(result_df)

# # =====================================================
# # Final output
# # =====================================================
# display(result_df)


### If the contact number is invalid, scrape the company website to get a contact number

In [None]:
# import asyncio
# import os
# from apify_client import ApifyClient

# # --- Initialize Apify client ---
# APIFY_TOKEN = os.environ["APIFY_TOKEN"]  # no hardcoded fallback; the token must come from the environment
# client = ApifyClient(APIFY_TOKEN)

# # --- Async wrapper so you can run in Jupyter ---
# async def enrich_with_contact_info(df):
#     """Scrape contact info for rows where Website_Valid == 'valid' and Phones is empty."""
#     updated_df = df.copy()

#     for i, row in df.iterrows():
#         website = row.get("Website")
#         status = row.get("Website_Valid")
#         phone = row.get("Phones")

#         if not website or status != "valid" or phone:
#             continue  # Skip invalid or already complete rows

#         print(f"üîç Scraping contact page for: {website}")

#         # --- Apify scraping run input ---
#         run_input = {
#             "startUrls": [{"url": website}],
#             "pageFunction": r"""
#                 async function pageFunction(context) {
#                     const $ = context.jQuery;
#                     const isContact = context.request.userData?.isContact || false;

#                     if (!isContact) {
#                         let contactUrl = null;
#                         $('a[href]').each((i, el) => {
#                             const href = $(el).attr('href').toLowerCase();
#                             if (href.includes('contact')) {
#                                 contactUrl = href.startsWith('http') ? href : window.location.origin + href;
#                                 return false;
#                             }
#                         });

#                         if (contactUrl) {
#                             await context.enqueueRequest({ url: contactUrl, userData: { isContact: true } });
#                             context.log.info(`Enqueued contact page: ${contactUrl}`);
#                         }
#                         return null;
#                     }

#                     function isVisible(el) {
#                         return el.offsetParent !== null;
#                     }

#                     let emails = $('a[href^="mailto"]').filter((i, el) => isVisible(el))
#                         .map((i, el) => $(el).attr('href').replace('mailto:', '').trim())
#                         .get();

#                     let phones = $('a[href^="tel"]').filter((i, el) => isVisible(el))
#                         .map((i, el) => $(el).attr('href').replace(/[^0-9]/g, ''))
#                         .get();

#                     emails = [...new Set(emails)];
#                     phones = [...new Set(phones)];

#                     return {
#                         contactUrl: context.request.url,
#                         emails: emails.length ? emails : [],
#                         phones: phones.length ? phones : []
#                     };
#                 }
#             """,
#             "injectJQuery": True,
#             "useChrome": True,
#             "headless": True,
#             "proxyConfiguration": {"useApifyProxy": True},
#         }

#         # --- Run the Apify scraper ---
#         try:
#             run = client.actor("moJRLRc85AitArpNN").call(run_input=run_input)
#             dataset = client.dataset(run["defaultDatasetId"])
#             results = list(dataset.iterate_items())
#             contact_results = [r for r in results if r and (r.get("emails") or r.get("phones"))]

#             if contact_results:
#                 scraped = contact_results[0]
#                 updated_df.at[i, "Emails"] = scraped.get("emails", None)
#                 updated_df.at[i, "Phones"] = scraped.get("phones", None)
#                 updated_df.at[i, "Contact_Page"] = scraped.get("contactUrl", None)
#                 print(f"‚úÖ Found: {scraped.get('phones', [])} / {scraped.get('emails', [])}")
#             else:
#                 print("‚ö†Ô∏è No contact data found.")

#         except Exception as e:
#             print(f"‚ùå Error scraping {website}: {e}")

#     return updated_df


# # --- Run the scraper for valid websites ---
# result_df = await enrich_with_contact_info(result_df)

# # --- Display updated results ---
# display(result_df)
