### Data Ingestion (Bronze 1)

- Data ingestion from ACRA, MasterDB and SSIC mapping, this data will be merged and filter with selected industry to get the specific company havent been researched by MR.

In [26]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re


### Getting Master DB via Google API

In [27]:
# --- OPTIMIZED: Handle timeouts, fetch with retries, and increase timeout limits ---

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google.auth.transport.requests import AuthorizedSession
import google.auth.transport.requests
import os
import pandas as pd
import re
import socket
import time

sheet_id = '1ipwIl7fciIlddvOUqGLpNlVQufw7Xd26Qa-YuJcx-xE'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

SERVICE_ACCOUNT_FILE = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "credentials.json")
if not os.path.exists(SERVICE_ACCOUNT_FILE):
    raise FileNotFoundError(
        f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'. "
        "Set GOOGLE_APPLICATION_CREDENTIALS to the full path, or place credentials.json next to this notebook."
    )

# --- OPTIMIZATION: Configure credentials and build service with increased timeout ---
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Build service - the timeout is configured at the request level
service = build('sheets', 'v4', credentials=credentials, cache_discovery=False)
sheet = service.spreadsheets()

range_a1 = "'MASTER DATABASE 2025 Template'!A:ZZ"

# --- OPTIMIZATION: Fetch with retry logic and increased timeout ---
def fetch_sheet_data_with_retry(sheet_service, spreadsheet_id, range_name, max_retries=3):
    """Fetch sheet data with exponential backoff retry logic."""
    
    # Increase socket timeout globally for this operation
    old_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(300)  # 5 minutes
    
    try:
        for attempt in range(max_retries):
            try:
                print(f"Fetching data from Google Sheets (attempt {attempt + 1}/{max_retries})...")
                
                # Execute the request
                result = sheet_service.values().get(
                    spreadsheetId=spreadsheet_id,
                    range=range_name
                ).execute()
                
                print("✓ Data fetched successfully!")
                return result
                
            except (TimeoutError, socket.timeout, OSError, ConnectionError) as e:
                wait_time = (2 ** attempt) * 2  # Exponential backoff: 2s, 4s, 8s
                print(f"⚠ Timeout error on attempt {attempt + 1}: {type(e).__name__}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print("✗ Max retries reached.")
                    print("\nTroubleshooting suggestions:")
                    print("1. Check your internet connection")
                    print("2. The sheet might be too large - consider splitting the range")
                    print("3. Try again later if Google Sheets API is experiencing issues")
                    print("4. Consider fetching only required columns instead of A:ZZ")
                    raise
            except HttpError as e:
                print(f"✗ HTTP Error: {e}")
                raise
            except Exception as e:
                print(f"✗ Unexpected error: {type(e).__name__}: {e}")
                raise
    finally:
        # Restore original timeout
        socket.setdefaulttimeout(old_timeout)

# Fetch data with retry mechanism
result = fetch_sheet_data_with_retry(sheet, sheet_id, range_a1)

# --- CLEAN: safe extraction ---
values = result.get('values', [])

if values:
    header = values[0]
    data_rows = values[1:]

    max_len = max([len(header)] + [len(r) for r in data_rows]) if data_rows else len(header)

    if len(header) < max_len:
        header = header + [f'col_{i+1}' for i in range(len(header), max_len)]

    normalized_rows = [row + [''] * (max_len - len(row)) for row in data_rows]

    Master_DB_df = pd.DataFrame(normalized_rows, columns=header)
    print(f"Successfully loaded {len(Master_DB_df)} rows and {len(Master_DB_df.columns)} columns")
else:
    Master_DB_df = pd.DataFrame()
    print("No data found in the sheet")

# ---------------- CLEANING FUNCTIONS ----------------

def clean_ssic_code(value):
    if pd.isna(value) or value == '':
        return None
    try:
        cleaned = re.sub(r"[^0-9]", "", str(value).strip())
        return int(cleaned) if cleaned else None
    except (ValueError, TypeError):
        return None

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = [
        re.sub(r"_+", "_", re.sub(r"[^A-Z0-9]", "_", col.upper().strip())).strip("_")
        for col in df.columns
    ]
    df.columns = new_cols
    return df

# ---------------- PROCESS MASTER DB ----------------

columns_to_keep = [
    "Company Registration Number (UEN)",
    "ACRA REGISTERED NAME",
    "Brand/Deal Name/Business Name",
    "Primary SSIC Code",
    "PIC NAME 1 Contact Number",
    "PIC 1 email address",
    "Website URL",
    "Parent Industry Type",
    "Sub Industry"
]

existing_cols = [c for c in columns_to_keep if c in Master_DB_df.columns]
if not existing_cols:
    raise ValueError("None of the required columns found in the dataframe")

master_db_df = Master_DB_df[existing_cols].copy()
master_db_df = standardize_columns(master_db_df)

uen_cols = [c for c in master_db_df.columns if "UEN" in c]
if not uen_cols:
    raise ValueError("UEN column not found after standardization")

uen_col = uen_cols[0]

master_db_df["UEN"] = (
    master_db_df[uen_col].astype(str).str.upper().str.replace(r"[^A-Z0-9]", "", regex=True)
)
master_db_df["UEN"] = master_db_df["UEN"].replace(['', 'NAN', 'NONE'], None)
master_db_df = master_db_df.drop(columns=[uen_col])

rename_map = {
    "BRAND_DEAL_NAME_BUSINESS_NAME": "BRAND_NAME",
    "PRIMARY_SSIC_CODE": "SSIC_CODE",
}

master_db_df = master_db_df.rename(columns={k: v for k, v in rename_map.items() if k in master_db_df.columns})

for col in ["ACRA_REGISTERED_NAME", "BRAND_NAME"]:
    if col in master_db_df.columns:
        master_db_df[col] = (
            master_db_df[col].astype(str).str.strip().str.upper().replace(['', 'NAN', 'NONE'], None)
        )

if "SSIC_CODE" in master_db_df.columns:
    master_db_df["SSIC_CODE"] = master_db_df["SSIC_CODE"].apply(clean_ssic_code)

required_cols = ["UEN", "ACRA_REGISTERED_NAME", "BRAND_NAME", "SSIC_CODE"]
available_cols = [c for c in required_cols if c in master_db_df.columns]
master_db_df = master_db_df[available_cols].copy()

master_db_df = master_db_df[
    master_db_df["UEN"].notna() & (master_db_df["UEN"].astype(str).str.strip() != "")
]

print(f"Final dataset: {len(master_db_df)} rows, {len(master_db_df.columns)} columns")
master_db_df

Fetching data from Google Sheets (attempt 1/3)...
✓ Data fetched successfully!
Successfully loaded 13186 rows and 63 columns
Final dataset: 12519 rows, 4 columns


Unnamed: 0,UEN,ACRA_REGISTERED_NAME,BRAND_NAME,SSIC_CODE
0,04799400B,AIK BEE TEXTILE CO,AIK BEE TEXTILE CO,46411.0
1,03376200K,SERANGOON GARDEN CLINIC AND DISPENSARY,GARDEN CLINIC,550263.0
2,06239600E,SALON DE BENZIMEN,SALON DE BENZIMEN,96021.0
3,06952000C,SU LAN LADIES FASHION,SU LAN LADIES FASHION,14103.0
4,10381600C,SIN HAI PRINTING SERVICE,SIN HAI PRINTING SERVICE,18113.0
...,...,...,...,...
13181,202325647M,LUNCH & LEARN EVENTS PTE. LTD.,LUNCH & LEARN EVENTS PTE. LTD.,
13182,198501579M,LUXURY TOURS & TRAVEL PTE LTD,LUXURY TOURS & TRAVEL PTE LTD,
13183,202524125D,BOTANI BEAUTY STORY PTE. LTD.,BOTANI BEAUTY STORY PTE. LTD.,
13184,202534441G,BLOOM HAIR STUDIO PTE. LTD.,BLOOM HAIR STUDIO PTE. LTD.,


### Getting ACRA Data (Filter by Live, Live Company only & non relevant ssic code)
- last downloaded oct 25

In [28]:

folder_path = "Acra_Data"

# Get all CSV file paths inside the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Read and combine all CSVs
# Using low_memory=False to avoid DtypeWarning for mixed types
df = pd.concat((pd.read_csv(f, low_memory=False) for f in csv_files), ignore_index=True)


df.columns = df.columns.str.upper()


acra_data = df[[
    "UEN",
    "ENTITY_NAME",
    "BUSINESS_CONSTITUTION_DESCRIPTION",
    "ENTITY_TYPE_DESCRIPTION",
    "ENTITY_STATUS_DESCRIPTION",
    "REGISTRATION_INCORPORATION_DATE",
    "PRIMARY_SSIC_CODE",
    "SECONDARY_SSIC_CODE",
    "STREET_NAME",
    "POSTAL_CODE"
]].copy()

# Convert to proper data types
acra_data['UEN'] = acra_data['UEN'].astype('string')
acra_data['ENTITY_NAME'] = acra_data['ENTITY_NAME'].astype('string')
acra_data['BUSINESS_CONSTITUTION_DESCRIPTION'] = acra_data['BUSINESS_CONSTITUTION_DESCRIPTION'].astype('string')
acra_data['ENTITY_TYPE_DESCRIPTION'] = acra_data['ENTITY_TYPE_DESCRIPTION'].astype('string')
acra_data['ENTITY_STATUS_DESCRIPTION'] = acra_data['ENTITY_STATUS_DESCRIPTION'].astype('string')
acra_data['REGISTRATION_INCORPORATION_DATE'] = pd.to_datetime(acra_data['REGISTRATION_INCORPORATION_DATE'], errors='coerce')

# Clean string columns — trim, remove extra spaces, uppercase
for col in [
    'UEN',
    'ENTITY_NAME',
    'BUSINESS_CONSTITUTION_DESCRIPTION',
    'ENTITY_TYPE_DESCRIPTION',
    'ENTITY_STATUS_DESCRIPTION',
    'STREET_NAME',
    'POSTAL_CODE'
]:
    acra_data[col] = (
        acra_data[col]
        .fillna('')
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .str.upper()
    )

# Replace placeholders with NaN for standardization
acra_data.replace(['NA', 'N/A', '-', ''], np.nan, inplace=True)

# Convert registration date to dd-mm-yyyy string (optional)
acra_data['REGISTRATION_INCORPORATION_DATE'] = acra_data['REGISTRATION_INCORPORATION_DATE'].dt.strftime('%d-%m-%Y')

# Filter only live entities (LIVE COMPANY or LIVE)
acra_data = acra_data[
    acra_data['ENTITY_STATUS_DESCRIPTION'].isin(['LIVE COMPANY', 'LIVE'])
].reset_index(drop=True)

# Exclude specific PRIMARY_SSIC_CODE values (supposedly the data would be 600k plus but when we exclude this would lessen)
exclude_codes = [
    46900, 47719, 47749, 47539, 47536, 56123,
    10711, 10712, 10719, 10732, 10733, 93209
]

acra_data = acra_data[~acra_data['PRIMARY_SSIC_CODE'].isin(exclude_codes)].reset_index(drop=True)

In [29]:
acra_data

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE
0,00182000A,AIK SENG HENG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,07-02-1975,46302,na,FISHERY PORT ROAD,619742
1,00233500W,ASIA STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28-10-1974,46411,20234,SIMS AVENUE,387509
2,00733000J,AIK CHE HIONG,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02-11-1974,32909,46900,ANG MO KIO INDUSTRIAL PARK 2A,568049
3,00927000X,A WALIMOHAMED BROS,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12-11-1974,46411,66126,JELLICOE ROAD,208767
4,01173000E,ANG TECK MOH DEPARTMENT STORE,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30-10-1974,47711,47214,WOODLANDS STREET 12,738623
...,...,...,...,...,...,...,...,...,...,...
537323,T25LL0518K,ZEUS BARBERS LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,16-05-2025,96021,na,KELANTAN LANE,200031
537324,T25LL0858C,ZENSE SPACE LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,01-08-2025,43301,46900,YISHUN INDUSTRIAL STREET 1,768161
537325,T25LL0870A,ZIQZEQ PROCUREMENT LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,04-08-2025,70209,46100,SIN MING LANE,573969
537326,T25LL1049B,ZHONG XIN TRAVEL LLP,,LIMITED LIABILITY PARTNERSHIP,LIVE,08-09-2025,79102,79101,JALAN BAHAGIA,320034


### Getting SSIC Industry code

In [30]:
# --- CONFIG ---
file_path = "./SSIC_Code/mapped_ssic_code.xlsx"

# --- LOAD DATA ---
mapped_ssic_code = pd.read_excel(file_path)

# --- STANDARDIZE COLUMN NAMES ---  
# Uppercase, strip spaces, replace spaces with underscores
mapped_ssic_code.columns = (
    mapped_ssic_code.columns
    .str.strip()
    .str.upper()
    .str.replace(" ", "_")
)

# --- KEEP ONLY DESIRED COLUMNS ---
columns_to_keep = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "SSIC_CODES", "DESCRIPTION"]
mapped_ssic_code = mapped_ssic_code[columns_to_keep].copy()

# --- CLEAN SSIC_CODES COLUMN ---
mapped_ssic_code["SSIC_CODES"] = (
    pd.to_numeric(mapped_ssic_code["SSIC_CODES"], errors="coerce")  # safely convert to numeric
    .fillna(0)
    .astype(int)
)

# --- CLEAN TEXT COLUMNS ---
text_cols = ["PARENT_INDUSTRY", "INDUSTRY_TYPE", "SUB_INDUSTRY", "DESCRIPTION"]
mapped_ssic_code[text_cols] = mapped_ssic_code[text_cols].apply(
    lambda col: col.astype(str).str.strip().str.title()
)

# --- REMOVE DUPLICATES & RESET INDEX ---
mapped_ssic_code = mapped_ssic_code.drop_duplicates().reset_index(drop=True)

mapped_ssic_code.head()


Unnamed: 0,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,SSIC_CODES,DESCRIPTION
0,Retail,Retail,Fashion & Apparel,47711,Retail Sale Of Clothing For Adults
1,Retail,Retail,Fashion & Apparel,47712,Retail Sale Of Children And Infants' Clothing
2,Retail,Retail,Fashion & Apparel,47715,Retail Sale Of Sewing And Clothing Accessories
3,Retail,Retail,Fashion & Apparel,47719,"Retail Sale Of Clothing, Footwear And Leather ..."
4,Retail,Retail,Fashion & Apparel,47510,Retail Sale Of Textiles


### Merge ACRA data with SSIC code

In [31]:
# Convert PRIMARY_SSIC_CODE to int
acra_data["PRIMARY_SSIC_CODE"] = (
    pd.to_numeric(acra_data["PRIMARY_SSIC_CODE"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Merge based on SSIC code
acra_data_filtered = acra_data.merge(
    mapped_ssic_code,
    how="left",
    left_on="PRIMARY_SSIC_CODE",
    right_on="SSIC_CODES"
)

# Optional: drop the duplicate 'SSIC CODES' column (keep only PRIMARY_SSIC_CODE)
acra_data_filtered = acra_data_filtered.drop(columns=["SSIC_CODES"], errors="ignore")


### FIlter Acra data with Master DB to get list of companies havent been researched  by MR

In [32]:

# Ensure both UEN columns are strings for accurate matching
acra_data_filtered['UEN'] = acra_data_filtered['UEN'].astype(str).str.strip().str.upper()
master_db_df['UEN'] = master_db_df['UEN'].astype(str).str.strip().str.upper()

# Filter out rows in acra_data_filtered whose UEN is already in master_db_df
acra_data_filtered = acra_data_filtered[~acra_data_filtered['UEN'].isin(master_db_df['UEN'])]

acra_data_filtered.shape

(529858, 14)

### Filter by  Industry

-   after 2020

In [33]:
# Event Organizer

ssic_codes = [
    "82303", "82302", "82301", "8230", "823", "82305"
]

# Convert to datetime
acra_data_filtered["REGISTRATION_INCORPORATION_DATE"] = pd.to_datetime(
    acra_data_filtered["REGISTRATION_INCORPORATION_DATE"],
    dayfirst=True,
    errors="coerce"
)

acra_data_filtered_by_industry = acra_data_filtered[
    (
        (acra_data_filtered["ENTITY_STATUS_DESCRIPTION"].str.lower() == "live") |
        (acra_data_filtered["ENTITY_STATUS_DESCRIPTION"].str.lower() == "live company")
    )
    &
    (acra_data_filtered["PRIMARY_SSIC_CODE"].astype(str).isin(ssic_codes))
]


# Optional: filter by date > 2020
acra_data_filtered_by_industry = acra_data_filtered_by_industry[
    acra_data_filtered_by_industry["REGISTRATION_INCORPORATION_DATE"] > "2020-01-01"
]

# Convert date to dd/mm/yyyy format
acra_data_filtered_by_industry["REGISTRATION_INCORPORATION_DATE"] = (
    acra_data_filtered_by_industry["REGISTRATION_INCORPORATION_DATE"].dt.strftime("%d/%m/%Y")
)

print(acra_data_filtered_by_industry.shape)
acra_data_filtered_by_industry

(2288, 14)


Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
18930,202001182D,AH LIM & SON PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,08/01/2020,82303,70201,YISHUN STREET 23,768441,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19195,202006699G,ASIA INTERNATIONAL CULTURE AND TOURISM GROUP P...,,LOCAL COMPANY,LIVE COMPANY,28/02/2020,82303,64202,VICTORIA PARK GROVE,266165,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19243,202007512K,ARTYPARTY PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,05/03/2020,82303,74905,HOUGANG STREET 91,530972,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19280,202008052E,ADEPTWERKS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,09/03/2020,82303,70201,HENDERSON ROAD,159545,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19343,202009378R,ADCC SOUTHEAST ASIA PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,21/03/2020,82303,85407,ANSON ROAD,79903,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535976,202534248N,ZENO (S) PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,06/08/2025,82303,62021,PAYA LEBAR ROAD,409051,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
536785,53448554E,ZEEYX,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,30/03/2022,82302,na,BUKIT DRIVE,587844,Services,Services,Event Management & Organisers,Convention/Conference/Corporate Meeting Organi...
536914,53477731K,ZHENGKANG EVENT CONSULTANCY,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,12/12/2023,82303,56111,CASSIA CRESCENT,391030,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
536918,53478473M,ZYG EVENTS AND ENTERTAINMENT,PARTNERSHIP,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28/12/2023,82303,59113,COMPASSVALE LANE,541205,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."


In [34]:
acra_data_filtered_by_industry.head(10)

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
18930,202001182D,AH LIM & SON PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,08/01/2020,82303,70201,YISHUN STREET 23,768441,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19195,202006699G,ASIA INTERNATIONAL CULTURE AND TOURISM GROUP P...,,LOCAL COMPANY,LIVE COMPANY,28/02/2020,82303,64202,VICTORIA PARK GROVE,266165,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19243,202007512K,ARTYPARTY PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,05/03/2020,82303,74905,HOUGANG STREET 91,530972,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19280,202008052E,ADEPTWERKS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,09/03/2020,82303,70201,HENDERSON ROAD,159545,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19343,202009378R,ADCC SOUTHEAST ASIA PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,21/03/2020,82303,85407,ANSON ROAD,79903,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19387,202010236Z,AV PEDIA PRIVATE LIMITED,,LOCAL COMPANY,LIVE COMPANY,31/03/2020,82303,42101,SOUTH BRIDGE ROAD,58727,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19429,202010783D,ASIA LIVE GROUP PRIVATE LIMITED,,LOCAL COMPANY,LIVE COMPANY,04/04/2020,82303,47711,CLEMENTI AVENUE 4,121311,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19912,202022189M,ASSOCIATION FOR HAEMOPHILIA AND ALLIED DISORDE...,,LOCAL COMPANY,LIVE COMPANY,30/07/2020,82302,85409,CECIL STREET,69542,Services,Services,Event Management & Organisers,Convention/Conference/Corporate Meeting Organi...
19939,202022854C,ANYMOMENT PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,05/08/2020,82303,70201,BAYFRONT AVENUE,18972,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
19974,202023722D,ALTERNATE DIMENSION PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,12/08/2020,82303,na,HINDHEDE WALK,587978,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."


### Sort & Group by companies 

In [35]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
import numpy as np

def preprocess_company_name(name):
    """Preprocessing for company names - removes common suffixes and special characters."""
    if pd.isna(name):
        return ""
    
    name = str(name).upper().strip()
    name = re.sub(r'[^\w\s]', ' ', name)
    
    suffixes = [
        'PRIVATE LIMITED', 'PTE LTD', 'PTE. LTD.', 'PTE LTD.', 'PTE. LTD',
        'PVT LTD', 'LIMITED', 'LTD', 'SINGAPORE', 'S.G.', 'SG', 'PTE',
        'COMPANY', 'CO', 'CORPORATION', 'CORP', 'INC', 'INCORPORATED',
        'LLC', 'LLP', 'PROPRIETARY', 'PROP'
    ]
    
    for suffix in suffixes:
        if name.endswith(' ' + suffix):
            name = name[:-len(suffix)-1].strip()
        elif name == suffix:
            name = ""
    
    name = re.sub(r'\s+', ' ', name).strip()
    return name

# Create a temporary column with cleaned names for similarity calculation
acra_data_filtered_by_industry = acra_data_filtered_by_industry.copy()
acra_data_filtered_by_industry['_clean_name'] = acra_data_filtered_by_industry['ENTITY_NAME'].apply(preprocess_company_name)

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 5),
    lowercase=True,
    max_df=0.90,
    min_df=1,
    strip_accents='unicode',
    sublinear_tf=True
)

# Transform to TF-IDF vectors using cleaned names
tfidf_matrix = vectorizer.fit_transform(acra_data_filtered_by_industry['_clean_name'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Convert similarity to distance for clustering
distance_matrix = 1 - cosine_sim

# Hierarchical clustering
condensed_dist = squareform(distance_matrix, checks=False)
linkage_matrix = linkage(condensed_dist, method='average')

# Get cluster labels (adjust threshold: lower = stricter grouping)
clusters = fcluster(linkage_matrix, t=0.5, criterion='distance')

# Add cluster to dataframe and sort
acra_data_filtered_by_industry['_cluster'] = clusters
acra_data_filtered_by_industry = acra_data_filtered_by_industry.sort_values(
    ['_cluster', 'ENTITY_NAME']
).reset_index(drop=True)

# Remove temporary columns (cleaned name and cluster)
acra_data_filtered_by_industry = acra_data_filtered_by_industry.drop(['_clean_name', '_cluster'], axis=1)

# Display result
acra_data_filtered_by_industry

Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,53495362W,GAJASNAP,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,02/12/2024,82303,18129,WOODLANDS DRIVE 75,730687,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
1,53464157A,TJA,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,08/03/2023,82302,na,ROBINSON ROAD,68898,Services,Services,Event Management & Organisers,Convention/Conference/Corporate Meeting Organi...
2,202403202D,ALWAYS READY PTE. LIMITED,,LOCAL COMPANY,LIVE COMPANY,23/01/2024,82303,na,HENDERSON ROAD,159545,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
3,53508123E,AAYIRA,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,03/08/2025,82303,na,YISHUN INDUSTRIAL STREET 1,768160,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
4,53485715M,QWUIRKY,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,17/05/2024,82303,73100,ANG MO KIO AVENUE 5,569880,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283,53474297A,ENIGMA EVENT,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,28/09/2023,82303,59112,WOODLANDS RING ROAD,730661,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
2284,202523563N,EVENTBASE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,29/05/2025,82303,90009,ROBINSON ROAD,068914,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
2285,202340172W,DOKI! DOKI! EVENTS PRIVATE LIMITED,,LOCAL COMPANY,LIVE COMPANY,07/10/2023,82303,73100,VENTURE DRIVE,608526,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
2286,53494844L,SOPHIE ROBERTSON EVENTS,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,20/11/2024,82303,na,SHELFORD ROAD,288423,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."


### Get the sample data

In [36]:
# get sample data 
acra_data_filtered_by_industry = acra_data_filtered_by_industry.sample(n=500).reset_index(drop=True)

print(acra_data_filtered_by_industry.shape)
acra_data_filtered_by_industry.head(10)


(500, 14)


Unnamed: 0,UEN,ENTITY_NAME,BUSINESS_CONSTITUTION_DESCRIPTION,ENTITY_TYPE_DESCRIPTION,ENTITY_STATUS_DESCRIPTION,REGISTRATION_INCORPORATION_DATE,PRIMARY_SSIC_CODE,SECONDARY_SSIC_CODE,STREET_NAME,POSTAL_CODE,PARENT_INDUSTRY,INDUSTRY_TYPE,SUB_INDUSTRY,DESCRIPTION
0,202309418E,FORTY FIVE POSITIVE TECH PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,13/03/2023,82303,na,CIRCULAR ROAD,49422,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
1,53469044X,STG KREW SPECIALISTS,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,15/06/2023,82303,82301,TAMPINES STREET 81,520825,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
2,202448723G,ENGAGE WORKFORCE GROUP PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,27/11/2024,82303,na,ROBINSON ROAD,68877,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
3,202219489Z,IFSG LTD.,,LOCAL COMPANY,LIVE COMPANY,06/06/2022,82303,85405,NEW INDUSTRIAL ROAD,536209,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
4,202509075W,PARTYHOST PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,02/03/2025,82303,na,UPPER SERANGOON ROAD,534779,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
5,202534542M,NOVALINKE PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,07/08/2025,82301,46900,BURN ROAD,369977,Services,Services,Event Management & Organisers,Exhibition Organisers
6,202401320H,JRJ EVENTS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,09/01/2024,82302,na,CECIL STREET,69538,Services,Services,Event Management & Organisers,Convention/Conference/Corporate Meeting Organi...
7,53461382M,UNTAMED EVENTS,SOLE-PROPRIETOR,SOLE PROPRIETORSHIP/ PARTNERSHIP,LIVE,11/01/2023,82303,47220,STIRLING ROAD,140164,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
8,202204545Z,EDU-SG PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,10/02/2022,82303,46900,SHENTON WAY,68808,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."
9,202123596K,RRE EVENTS PTE. LTD.,,LOCAL COMPANY,LIVE COMPANY,06/07/2021,82303,70209,WOODLANDS DRIVE 52,731624,Services,Services,Event Management & Organisers,"Event Organisers (Except Concerts, Sports/Fitn..."


In [37]:
print(acra_data_filtered_by_industry.shape)
print(acra_data_filtered_by_industry["UEN"].is_unique)


(500, 14)
True


In [38]:
acra_data_filtered_by_industry.to_parquet("./Staging/Bronze/bronze_data_1.parquet", index=False, engine="fastparquet")

In [39]:
# parquet_path = "./Staging/Silver/Silver_data_2_550.parquet"
# if os.path.exists(parquet_path):
#     RecordOwl_Leads = pd.read_parquet(parquet_path, engine="fastparquet")
#     print(f"Loaded {len(RecordOwl_Leads)} rows from {parquet_path}")
#     print(RecordOwl_Leads.shape)
# else:
#     raise FileNotFoundError(f"Parquet file not found at {parquet_path}")


# # Ensure both UEN columns are strings for accurate matching
# acra_data_filtered['UEN'] = acra_data_filtered['UEN'].astype(str).str.strip().str.upper()
# RecordOwl_Leads['UEN'] = RecordOwl_Leads['UEN'].astype(str).str.strip().str.upper()

# # Filter out rows in acra_data_filtered whose UEN is already in RecordOwl_Leads
# acra_data_filtered = acra_data_filtered[~acra_data_filtered['UEN'].isin(RecordOwl_Leads['UEN'])]

# acra_data_filtered.shape