### Facebook & Website Mining (Silver 2)

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from Apify_Scrapper_Functions import Get_Phone_Number_From_Facebook, Get_Phone_Number_From_Website


ModuleNotFoundError: No module named 'Apify_Scrapper_Functions'

### Import scraped data

In [2]:
# Load the Silver-stage table that the rest of the notebook works from
CarMotor_total_uncleaned_df = pd.read_parquet(
    "./Staging/Silver/CarMotor_total_uncleaned.parquet",
    engine="fastparquet",
)

# Sanity output: is UEN a unique key, and how large is the table?
print(
    CarMotor_total_uncleaned_df["UEN"].is_unique,
    CarMotor_total_uncleaned_df.shape,
    sep="\n",
)

True
(1050, 16)


### Separate rows with and without phone numbers

In [3]:
# Split the raw table on whether a phone value is present (null = missing).
# The mask is computed once and negated, so the two halves always partition the table.
has_phone = CarMotor_total_uncleaned_df["Phones"].notna()
df_with_phones = CarMotor_total_uncleaned_df[has_phone]
df_without_phones = CarMotor_total_uncleaned_df[~has_phone]

# keep=False flags *every* row of a repeated phone, not just the later occurrences,
# so a phone shared by several rows is removed from the trusted set entirely.
is_shared_phone = df_with_phones["Phones"].duplicated(keep=False)

# Rows whose phone appears exactly once
final_df_1 = df_with_phones[~is_shared_phone]

# Rows whose phone appears on more than one row; these are re-processed later
refilter_df_1 = df_with_phones[is_shared_phone]

# Print shapes
print("Total having Phone numbers:", df_with_phones.shape)
print("Total without Phone numbers:", df_without_phones.shape)
print("Unique phone rows (final_df_1):", final_df_1.shape)
print("Duplicate phone rows (refilter_df_1):", refilter_df_1.shape)


Total having Phone numbers: (536, 16)
Total without Phone numbers: (514, 16)
Unique phone rows (final_df_1): (482, 16)
Duplicate phone rows (refilter_df_1): (54, 16)


In [None]:
# Sanity check: after casting to str, no phone value in final_df_1 should repeat
print(final_df_1["Phones"].astype(str).is_unique)


### Merge refilter_df_1 with df_without_phones

In [None]:
# Rows with a shared (duplicate) phone are pooled with the rows that have no phone
# at all; both groups still need a phone number from another source.
rows_needing_phone = pd.concat([refilter_df_1, df_without_phones], ignore_index=True)

# Keep one row per UEN (the first occurrence wins)
df_without_phones_2 = rows_needing_phone.drop_duplicates(subset=["UEN"], keep="first")

df_without_phones_2.shape

### Separate rows with and without Facebook pages

In [None]:
# A Facebook value counts as missing when it is null or an empty string
facebook_link = df_without_phones_2["Facebook"]
facebook_missing = facebook_link.isna() | facebook_link.eq("")

# Rows that have a Facebook value
df_with_facebook = df_without_phones_2[~facebook_missing]

# Rows that do not
df_without_facebook = df_without_phones_2[facebook_missing]

# Check the shapes
print(f"With Facebook: {df_with_facebook.shape}")
print(f"Without Facebook: {df_without_facebook.shape}")

In [None]:
# Sanity check: each UEN should appear only once among the rows that have a Facebook value
print(df_with_facebook["UEN"].is_unique)


In [None]:
# Display (rows, columns) of the frame that is about to be sent to the Facebook scraper
df_with_facebook.shape

### Extract phone numbers from Facebook pages

In [None]:

# Run the Facebook scraper on the rows that have a Facebook value; it returns two frames.
# NOTE(review): the helper lives in Apify_Scrapper_Functions (not visible here). Judging
# by the names, the first frame holds rows with a usable phone and the second holds rows
# without one; confirm against the helper before relying on that.
unique_phone_df, no_phone_df = Get_Phone_Number_From_Facebook(df_with_facebook)


In [None]:
# Sizes of the two frames returned by the Facebook scraper
print(unique_phone_df.shape, no_phone_df.shape, sep="\n")

# Uniqueness of Phones and UEN: first for unique_phone_df, then for no_phone_df
print(
    unique_phone_df["Phones"].is_unique,
    unique_phone_df["UEN"].is_unique,
    no_phone_df["Phones"].is_unique,
    no_phone_df["UEN"].is_unique,
    sep="\n",
)

In [None]:
# Display (rows, columns) of the second frame returned by the Facebook scraper
no_phone_df.shape

In [None]:
# Phones obtained from the Facebook scraper
final_df_2 = unique_phone_df.copy()

# A scraped phone that already belongs to a row in final_df_1 is ambiguous: appending
# it would put the same phone on two businesses. The website merge further down
# rejects such rows, so the same rule is applied here. Phones are compared as text
# (without modifying either frame) so mixed dtypes cannot hide a match.
facebook_phone_conflict = (
    final_df_2["Phones"].astype(str).isin(final_df_1["Phones"].astype(str))
)

# Only non-conflicting Facebook rows join the trusted set
final_df_3 = pd.concat(
    [final_df_1, final_df_2[~facebook_phone_conflict]],
    ignore_index=True,
)

# Everything still unresolved: no Facebook value, no phone found on Facebook, or a
# Facebook phone that clashes with an already accepted one. One row per UEN is kept.
df_without_phones_3 = (
    pd.concat(
        [df_without_facebook, no_phone_df, final_df_2[facebook_phone_conflict]],
        ignore_index=True,
    )
      .drop_duplicates(subset=["UEN"])
)


print(final_df_3.shape)
print(df_without_phones_3.shape)

In [None]:
# UEN must remain a unique key in both the accepted set and the still-unresolved set
print(
    final_df_3["UEN"].is_unique,
    df_without_phones_3["UEN"].is_unique,
    sep="\n",
)

### Separate rows with and without websites

In [None]:
# A Website value counts as missing when it is null or an empty string
website_link = df_without_phones_3["Website"]
website_missing = website_link.isna() | website_link.eq("")

# Rows that have a Website value
df_with_websites = df_without_phones_3[~website_missing]

# Rows that do not
df_without_websites = df_without_phones_3[website_missing]

# Check the shapes
print(f"With Websites: {df_with_websites.shape}")
print(f"Without Websites: {df_without_websites.shape}")
print(f"Total from df_without_phones_3: {df_without_phones_3.shape}")

### Extract Phone Numbers from Websites

In [None]:
# Run the website scraper on the rows that have a Website value.
# NOTE(review): the helper lives in Apify_Scrapper_Functions (not visible here); the
# cells below read "Phones" and "UEN" columns from its result, so confirm it returns them.
scrapped_from_websites = Get_Phone_Number_From_Website(df_with_websites)

In [None]:
# Compare phones as text. Missing values become "" so they can be recognised below.
phones = scrapped_from_websites["Phones"].fillna("").astype(str)

# A row only has a phone if something other than whitespace is left. Without this
# guard a single row with a missing phone would have the value "" exactly once and
# be classified as a "unique phone".
has_phone = phones.str.strip() != ""

# keep=False marks every row whose phone text occurs more than once
is_shared_phone = phones.duplicated(keep=False)

# Boolean masks: the two groups partition scrapped_from_websites
mask_unique = has_phone & ~is_shared_phone
mask_non_unique = ~mask_unique

# unique_df: rows with a phone that no other scraped row has.
# non_unique_df: rows with a shared phone or with no phone at all.
# reset_index returns a new frame, so neither is a view of the scraper output.
unique_df = scrapped_from_websites[mask_unique].reset_index(drop=True)
non_unique_df = scrapped_from_websites[mask_non_unique].reset_index(drop=True)


In [None]:
# Display (rows, columns) of the website rows that were not accepted as having a unique phone
non_unique_df.shape

In [None]:
# Display (rows, columns) of the website rows accepted as having a unique phone
unique_df.shape

### Merge website results into the final DataFrame

In [None]:
# Work on string-typed copies of the key columns instead of overwriting columns of
# frames created in earlier cells; this cell then leaves its inputs untouched.
accepted_so_far = final_df_3.astype({"UEN": str, "Phones": str})
website_candidates = unique_df.astype({"UEN": str, "Phones": str})

# 1. A candidate conflicts if its UEN or its phone is already in the accepted set
mask_duplicate = (
    website_candidates["UEN"].isin(accepted_so_far["UEN"]) |
    website_candidates["Phones"].isin(accepted_so_far["Phones"])
)

# Rows that cannot be appended
unique_conflicts_df = website_candidates[mask_duplicate].copy()

# Rows safe to append
unique_clean_df = website_candidates[~mask_duplicate].copy()

# 2. Create final_df_4 (new DataFrame); ignore_index=True already gives a clean 0..n-1 index
final_df_4 = pd.concat([accepted_so_far, unique_clean_df], ignore_index=True)

# 3. Append conflict rows to non_unique_df. Because the name is rebound to its own
#    extension, re-running this cell would append the same conflicts again;
#    de-duplicating on UEN keeps the result the same however often the cell runs.
non_unique_df = (
    pd.concat([non_unique_df, unique_conflicts_df], ignore_index=True)
      .drop_duplicates(subset=["UEN"], keep="first")
      .reset_index(drop=True)
)


In [None]:
# Display (rows, columns) of non_unique_df after the merge-conflict rows were appended to it
non_unique_df.shape

In [None]:
# Final "no contact" set: rows without a website plus the website rows that were not
# accepted, reduced to one row per UEN (first occurrence wins) with a fresh index.
df_without_phones_4 = (
    pd.concat([df_without_websites, non_unique_df], ignore_index=True)
      .drop_duplicates(subset=["UEN"], keep="first")
      .reset_index(drop=True)
)

df_without_phones_4.shape


In [None]:
# Sanity check: each UEN should appear only once in the no-contact set
df_without_phones_4["UEN"].is_unique

In [None]:
# Sanity check: each UEN should appear only once in the final contact set
final_df_4["UEN"].is_unique

In [None]:
# NOTE(review): this repeats the UEN uniqueness check of the preceding cell verbatim;
# it is probably a leftover. Confirm whether a different frame or column was intended.
final_df_4["UEN"].is_unique

In [None]:
# Sanity check: no phone value should be shared by two rows of the final contact set
final_df_4["Phones"].is_unique

### Saving

In [None]:
# Persist the rows that ended up with a phone number.
# The Gold folder is created on demand so the write does not fail (and discard the
# scraping results held in memory) merely because the folder is missing.
gold_contacts_path = "./Staging/Gold/carmotor_scrapped_2_data.parquet"
os.makedirs(os.path.dirname(gold_contacts_path), exist_ok=True)
final_df_4.to_parquet(gold_contacts_path, index=False, engine="fastparquet")

In [None]:
# Persist the rows for which no usable phone number was found.
# The Gold folder is created on demand so the write does not fail merely because
# the folder is missing.
gold_no_contact_path = "./Staging/Gold/carmotor_scrapped_2_no_contact_data.parquet"
os.makedirs(os.path.dirname(gold_no_contact_path), exist_ok=True)
df_without_phones_4.to_parquet(gold_no_contact_path, index=False, engine="fastparquet")