In [44]:
import os
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import time
import scrapy
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
import nest_asyncio
import glob
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
import re


In [45]:
# Load the leads sheet into a DataFrame; the path is relative to the notebook's
# working directory. Every later cell in this notebook works on this frame.
Fresh_Leads_formatted = pd.read_csv("Golden_Data/Fresh_Leads_with_PhoneNumber_Nov11.csv")


In [46]:
# Remove duplicate Website URLs - keep the first occurrence, set later ones to NaN.
#
# URLs are compared case-insensitively and with surrounding whitespace stripped.
# The comparison key is held in a standalone Series (never written into the
# frame), and the SAME key drives both the "before" statistics and the removal
# mask, so the printed counts always reconcile with what was actually blanked.
if 'Website URL' in Fresh_Leads_formatted.columns:
    total_rows = len(Fresh_Leads_formatted)
    valid_urls = Fresh_Leads_formatted['Website URL'].notna()

    # Normalised comparison key; rows without a URL stay missing so they are
    # neither counted as unique values nor selected for removal.
    url_key = (
        Fresh_Leads_formatted['Website URL']
        .astype(str)
        .str.strip()
        .str.lower()
        .where(valid_urls)
    )

    # Second and later occurrences of a key, restricted to rows that hold a URL.
    mask_to_remove = url_key.duplicated(keep='first') & valid_urls
    unique_urls_before = url_key.nunique()
    duplicate_count = int(mask_to_remove.sum())

    print(f"Before removing duplicates:")
    print(f"  Total rows: {total_rows}")
    print(f"  Rows with Website URL: {valid_urls.sum()}")
    print(f"  Unique URLs: {unique_urls_before}")
    print(f"  Duplicate URLs: {duplicate_count}")

    # Blank only the URL cell; the rest of the lead row is left untouched.
    Fresh_Leads_formatted.loc[mask_to_remove, 'Website URL'] = pd.NA

    # Count after removal
    valid_urls_after = Fresh_Leads_formatted['Website URL'].notna()
    unique_urls_after = Fresh_Leads_formatted.loc[valid_urls_after, 'Website URL'].nunique()

    print(f"\nAfter removing duplicates:")
    print(f"  Rows with Website URL: {valid_urls_after.sum()}")
    print(f"  Unique URLs: {unique_urls_after}")
    # Report the number of cells actually blanked, not a separately computed count.
    print(f"  Duplicates removed: {mask_to_remove.sum()}")
    print(f"  Rows with duplicate URLs set to NaN: {mask_to_remove.sum()}")
else:
    print("Warning: 'Website URL' column not found in dataframe")


Before removing duplicates:
  Total rows: 105
  Rows with Website URL: 64
  Unique URLs: 64
  Duplicate URLs: 0

After removing duplicates:
  Rows with Website URL: 64
  Unique URLs: 64
  Duplicates removed: 0
  Rows with duplicate URLs set to NaN: 0


In [47]:
# De-duplicate Facebook Page, Instagram URL and PIC 1 email address.
# For each column the first occurrence of a value is kept and every later
# repeat (compared lower-cased and whitespace-stripped) is replaced with NaN.
columns_to_clean = ['Facebook Page', 'Instagram URL', 'PIC 1 email address']

for col_name in columns_to_clean:
    # Guard clause: report and move on when the sheet lacks this column.
    if col_name not in Fresh_Leads_formatted.columns:
        print(f"⚠️  Warning: '{col_name}' column not found in dataframe")
        continue

    print(f"\n{'='*60}")
    print(f"Processing: {col_name}")
    print(f"{'='*60}")

    # "Before" statistics are taken on the raw, un-normalised values.
    valid_values = Fresh_Leads_formatted[col_name].notna()
    populated = Fresh_Leads_formatted.loc[valid_values, col_name]
    unique_before = populated.nunique()
    duplicate_count = populated.duplicated().sum()

    print(f"Before removing duplicates:")
    print(f"  Rows with {col_name}: {valid_values.sum()}")
    print(f"  Unique values: {unique_before}")
    print(f"  Duplicate values: {duplicate_count}")

    # Scratch column holding the normalised comparison key.
    scratch_col = f'{col_name}_cleaned'
    Fresh_Leads_formatted[scratch_col] = (
        Fresh_Leads_formatted[col_name].astype(str).str.strip().str.lower()
    )

    # Later repeats of a key, limited to rows that actually hold a value
    # (missing cells stringify to the same text and must not be touched).
    repeated = Fresh_Leads_formatted[scratch_col].duplicated(keep='first')
    mask_to_remove = repeated & valid_values
    Fresh_Leads_formatted.loc[mask_to_remove, col_name] = pd.NA

    # The scratch column has served its purpose.
    Fresh_Leads_formatted = Fresh_Leads_formatted.drop(columns=[scratch_col])

    # "After" statistics.
    valid_after = Fresh_Leads_formatted[col_name].notna()
    unique_after = Fresh_Leads_formatted.loc[valid_after, col_name].nunique()

    print(f"\nAfter removing duplicates:")
    print(f"  Rows with {col_name}: {valid_after.sum()}")
    print(f"  Unique values: {unique_after}")
    print(f"  Duplicates removed: {mask_to_remove.sum()}")
    print(f"  ✅ Successfully removed {mask_to_remove.sum()} duplicate values")

print(f"\n{'='*60}")
print("Duplicate removal complete for Facebook Page, Instagram URL, and PIC 1 email address!")
print(f"{'='*60}")



Processing: Facebook Page
Before removing duplicates:
  Rows with Facebook Page: 32
  Unique values: 32
  Duplicate values: 0

After removing duplicates:
  Rows with Facebook Page: 32
  Unique values: 32
  Duplicates removed: 0
  ✅ Successfully removed 0 duplicate values

Processing: Instagram URL
Before removing duplicates:
  Rows with Instagram URL: 22
  Unique values: 22
  Duplicate values: 0

After removing duplicates:
  Rows with Instagram URL: 22
  Unique values: 22
  Duplicates removed: 0
  ✅ Successfully removed 0 duplicate values

Processing: PIC 1 email address
Before removing duplicates:
  Rows with PIC 1 email address: 47
  Unique values: 42
  Duplicate values: 5

After removing duplicates:
  Rows with PIC 1 email address: 42
  Unique values: 42
  Duplicates removed: 5
  ✅ Successfully removed 5 duplicate values

Duplicate removal complete for Facebook Page, Instagram URL, and PIC 1 email address!


In [48]:
# Show the frame's column labels as a plain Python list. As the last expression
# in the cell, the notebook renders the result.
Fresh_Leads_formatted.columns.tolist()

['ePOS Code',
 'Company Code',
 'Date',
 'ACRA REGISTERED NAME',
 'Brand/Deal Name/Business Name',
 'Sub Domain Link (If Lead is already available in Backend) Fill only when EPOS client',
 'Tele Sales or MR (For KPI - Internal)',
 'Name of the Market Researcher',
 'Original Source (Marketing)',
 'Marketing Source (Do not fill anything if the leads are from Hubspot, EPOS clients)',
 'Company Registration date / Date Established',
 'Company Registration Number (UEN)',
 'Primary SSIC Code',
 'Secondary SSIC Code',
 'Hubspot ID (Company)',
 'Hubspot ID(Deal)',
 'Hubspot ID(Contact)',
 'Website URL',
 'Business Type',
 'Facebook Page',
 'Instagram URL',
 'Linkedin URL',
 'Tik Tok URL',
 'Ownership Type',
 'Parent Industry Type',
 'Industry Type',
 'Sub Industry',
 'Business model',
 'Presence of Multiple Outlets',
 'Number of Outlets (Write in #)',
 'Region',
 'Planning Area',
 'Business Location Type',
 'Registered Address (Block & Street)',
 'Registered Address  (Unit #)',
 'Registered Ad

In [49]:
# AGGRESSIVE DATE REMOVAL for Operational Address (Unit #) - Remove ALL dates!
#
# The column is located by name (must contain "operational", "unit" and
# "address", case-insensitively). Every cell that looks like a date is then
# replaced with NaN; everything else is left exactly as it was.

# Month names, short or long form (jan/january ... sep/sept/september ... dec).
_MONTH = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)
_SEP = r'[\s\-/.,]*'
# "2-Jan", "12 Nov 2023" ...
_DAY_MONTH = r'\d{1,4}' + _SEP + _MONTH + r'(?:' + _SEP + r'\d{1,4})?'
# "Jan-02", "Nov 11, 2023" ...
_MONTH_DAY = _MONTH + _SEP + r'\d{1,4}(?:' + _SEP + r'\d{1,4})?'
# The WHOLE value must be a month-name date, so unit numbers that merely
# contain letters ("02A", "12W", "330G") can never match.
_MONTH_DATE_RE = re.compile(r'^(?:' + _DAY_MONTH + r'|' + _MONTH_DAY + r')$', re.IGNORECASE)

# Text produced by stringifying a missing cell; never treated as a date.
_MISSING_TOKENS = {'nan', 'none', '', 'nat', '<na>'}


def _looks_like_date(raw):
    """Return True when the stringified cell value ``raw`` should be treated as a date.

    Checks, in order:
      1. missing-value text                               -> not a date
      2. whole value is a month-name date ("2-Jan")       -> date
      3. any other value containing a letter ("02A")      -> unit number, not a date
      4. contains Y-M-D or D-M-Y digits ("2023-01-02")    -> date
      5. "D-M" / "D/M" with day 1-31 and month 1-12       -> date
      6. otherwise: a date iff ``pd.to_datetime`` accepts it
    """
    text = str(raw).strip()
    if text.lower() in _MISSING_TOKENS:
        return False

    # Month-name dates are checked BEFORE the letter guard below; otherwise the
    # letters in the month name would make the guard keep the value.
    if _MONTH_DATE_RE.match(text):
        return True

    # KEEP anything else that has a letter (02A, 12W, 330G) - these are unit numbers
    if re.search(r'[A-Za-z]', text):
        return False

    # Fully numeric date patterns anywhere in the value.
    if re.search(r'\d{4}[-/]\d{1,2}[-/]\d{1,2}', text) or re.search(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}', text):
        return True

    # Two-part value: only a date when the parts are in day/month ranges.
    two_part = re.match(r'^(\d{1,2})[-/](\d{1,2})$', text)
    if two_part and 1 <= int(two_part.group(1)) <= 31 and 1 <= int(two_part.group(2)) <= 12:
        return True

    # Last resort: let pandas decide. Parse failures mean "not a date";
    # `except Exception` (not a bare except) lets KeyboardInterrupt through.
    # NOTE(review): the parser may accept a bare 4-digit number such as "2015"
    # as a year, which would blank a unit number of that form - confirm intended.
    try:
        pd.to_datetime(text, errors='raise')
    except Exception:
        return False
    return True


# Find the column
op_col = next(
    (
        col for col in Fresh_Leads_formatted.columns
        if 'operational' in col.lower() and 'unit' in col.lower() and 'address' in col.lower()
    ),
    None,
)

if op_col:
    print(f"AGGRESSIVELY cleaning: '{op_col}'\n")
    s = Fresh_Leads_formatted[op_col].astype(str)

    # Index labels of every cell classified as a date.
    to_remove = [idx for idx, val in s.items() if _looks_like_date(val)]

    # Remove all dates
    print(f"Removing {len(to_remove)} date values...")
    if to_remove:
        print("Examples of dates being removed:")
        for idx in to_remove[:15]:
            print(f"  Row {idx}: {s.loc[idx]}")
        if len(to_remove) > 15:
            print(f"  ... and {len(to_remove) - 15} more")

    Fresh_Leads_formatted.loc[to_remove, op_col] = pd.NA

    print(f"\n✅ Done! Removed {len(to_remove)} dates.")
    print(f"Remaining non-null values: {Fresh_Leads_formatted[op_col].notna().sum()}")
    print("\nSample of remaining values:")
    print(Fresh_Leads_formatted[Fresh_Leads_formatted[op_col].notna()][[op_col]].head(15))
else:
    print("Column not found!")


AGGRESSIVELY cleaning: 'Operational Address 
(Unit #)'

Removing 0 date values...

✅ Done! Removed 0 dates.
Remaining non-null values: 96

Sample of remaining values:
   Operational Address \n(Unit #)
0                               4
1                               8
2                             590
3                               1
4                               3
5                              76
6                             02A
7                             309
8                               7
9                               4
10                            103
11                            607
12                             31
13                              1
14                              0


In [50]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): this is the same path the frame was read from at the top of the
# notebook, so the original input file is overwritten in place and a re-run
# starts from already-cleaned data - confirm that is intended.
Fresh_Leads_formatted.to_csv("Golden_Data/Fresh_Leads_with_PhoneNumber_Nov11.csv", index=False)