Service Acc: masterdb@axiomatic-atlas-476707-k8.iam.gserviceaccount.com

In [19]:

import os
import pandas as pd
import re


In [None]:

#Service Acc: masterdb@axiomatic-atlas-476707-k8.iam.gserviceaccount.com

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

sheet_id = '1ipwIl7fciIlddvOUqGLpNlVQufw7Xd26Qa-YuJcx-xE'

SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# Prefer env var GOOGLE_APPLICATION_CREDENTIALS, else fall back to local credentials.json
SERVICE_ACCOUNT_FILE = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "credentials.json")
if not os.path.exists(SERVICE_ACCOUNT_FILE):
    raise FileNotFoundError(
        f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'. "
        "Set GOOGLE_APPLICATION_CREDENTIALS to the full path, or place credentials.json next to this notebook."
    )

credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('sheets', 'v4', credentials=credentials)
sheet = service.spreadsheets()


# Using a large range to ensure we get all data (sheets typically don't exceed 1000 columns)
range_a1 = "'MASTER DATABASE 2025 Template'!A:ZZ"
result = sheet.values().get(spreadsheetId=sheet_id, range=range_a1).execute()
values = result.get('values', [])

if values:
    header = values[0]
    data_rows = values[1:]
    
    # Find the maximum number of columns across all rows
    max_len = max([len(header)] + [len(r) for r in data_rows]) if data_rows else len(header)
    
    # Extend header if data rows have more columns
    if len(header) < max_len:
        header = header + [f'col_{i+1}' for i in range(len(header), max_len)]
    
    # Normalize all rows to have the same length
    normalized_rows = [row + [''] * (max_len - len(row)) for row in data_rows]
    
    Master_DB_df = pd.DataFrame(normalized_rows, columns=header)
    print(f"Successfully loaded {len(Master_DB_df)} rows and {len(Master_DB_df.columns)} columns")
else:
    Master_DB_df = pd.DataFrame()
    print("No data found in the sheet")

# --- CLEANING AND STANDARDIZING THE MASTER DB ---

# --- HELPER FUNCTIONS ---
def clean_uen(u: str) -> str | None:
    """Clean UEN: remove non-alphanumeric, convert to uppercase."""
    if pd.isna(u) or u == '':
        return None
    cleaned = re.sub(r"[^A-Z0-9]", "", str(u).upper().strip())
    return None if cleaned == '' else cleaned

def clean_text(text: str) -> str | None:
    """Clean text: strip, uppercase, handle NaN values."""
    if pd.isna(text) or text == '':
        return None
    text = str(text).strip().upper()
    return None if text in ('', 'NAN', 'NONE') else text

def clean_ssic_code(value) -> int | None:
    """Convert SSIC code to integer, handling empty strings and invalid values."""
    if pd.isna(value) or value == '':
        return None
    try:
        # Remove any non-numeric characters and convert
        cleaned = re.sub(r"[^0-9]", "", str(value).strip())
        return int(cleaned) if cleaned else None
    except (ValueError, TypeError):
        return None

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert column names to uppercase, replace non-alphanumeric with single underscore."""
    new_cols = [
        re.sub(r"_+", "_", re.sub(r"[^A-Z0-9]", "_", col.upper().strip())).strip("_")
        for col in df.columns
    ]
    df.columns = new_cols
    return df

# --- PROCESS DATA ---
# Select relevant columns first (more efficient than copying entire dataframe)
columns_to_keep = [
    "Company Registration Number (UEN)",
    "ACRA REGISTERED NAME",
    "Brand/Deal Name/Business Name",
    "Primary SSIC Code",
    "PIC NAME 1 Contact Number",
    "PIC 1 email address",
    "Website URL",
    "Parent Industry Type",
    "Sub Industry"
]

# Filter columns that exist in the dataframe
existing_cols = [c for c in columns_to_keep if c in Master_DB_df.columns]
if not existing_cols:
    raise ValueError("None of the required columns found in the dataframe")

master_db_df = Master_DB_df[existing_cols].copy()

# Standardize column names
master_db_df = standardize_columns(master_db_df)

# Find and process UEN column
uen_cols = [c for c in master_db_df.columns if "UEN" in c]
if not uen_cols:
    raise ValueError("UEN column not found after standardization")
uen_col = uen_cols[0]

# Clean UEN using vectorized operations (faster than apply)
master_db_df["UEN"] = master_db_df[uen_col].astype(str).str.upper().str.replace(r"[^A-Z0-9]", "", regex=True)
master_db_df["UEN"] = master_db_df["UEN"].replace(['', 'NAN', 'NONE'], None)
master_db_df = master_db_df.drop(columns=[uen_col])

# Rename columns
rename_map = {
    "BRAND_DEAL_NAME_BUSINESS_NAME": "BRAND_NAME",
    "PRIMARY_SSIC_CODE": "SSIC_CODE",
}
master_db_df = master_db_df.rename(columns={k: v for k, v in rename_map.items() if k in master_db_df.columns})

# Clean text columns using vectorized operations
for col in ["ACRA_REGISTERED_NAME", "BRAND_NAME"]:
    if col in master_db_df.columns:
        master_db_df[col] = (
            master_db_df[col].astype(str).str.strip().str.upper()
            .replace(['', 'NAN', 'NONE'], None)
        )

# Convert SSIC_CODE to integer (handles empty strings and invalid values)
if "SSIC_CODE" in master_db_df.columns:
    master_db_df["SSIC_CODE"] = master_db_df["SSIC_CODE"].apply(clean_ssic_code)

# Keep only required columns that exist
required_cols = ["UEN", "ACRA_REGISTERED_NAME", "BRAND_NAME", "SSIC_CODE"]
available_cols = [c for c in required_cols if c in master_db_df.columns]
master_db_df = master_db_df[available_cols].copy()

# Filter out rows with missing or empty UEN
master_db_df = master_db_df[
    master_db_df["UEN"].notna() & 
    (master_db_df["UEN"].astype(str).str.strip() != "")
].copy()

print(f"Final dataset: {len(master_db_df)} rows, {len(master_db_df.columns)} columns")
master_db_df



Successfully loaded 10462 rows and 63 columns
Final dataset: 9800 rows, 4 columns


Unnamed: 0,UEN,ACRA_REGISTERED_NAME,BRAND_NAME,SSIC_CODE
0,04799400B,AIK BEE TEXTILE CO,AIK BEE TEXTILE CO,46411.0
1,03376200K,SERANGOON GARDEN CLINIC AND DISPENSARY,GARDEN CLINIC,550263.0
2,06239600E,SALON DE BENZIMEN,SALON DE BENZIMEN,96021.0
3,06952000C,SU LAN LADIES FASHION,SU LAN LADIES FASHION,14103.0
4,10381600C,SIN HAI PRINTING SERVICE,SIN HAI PRINTING SERVICE,18113.0
...,...,...,...,...
10457,202306640R,MISTER MOBILE POTONG PASIR PTE. LTD.,MISTER MOBILE (POTONG PASIR),47411.0
10458,202103010N,MISTER MOBILE JURONG PTE. LTD.,MISTER MOBILE (JURONG),47411.0
10459,201734006N,MISTER MOBILE HOUGANG PTE. LTD.,MISTER MOBILE (HOUGANG),95120.0
10460,202210879W,MISTER MOBILE CHINATOWN PTE. LTD.,MISTER MOBILE (CHINATOWN),47411.0
