In [17]:
import pandas as pd
import numpy as np
from typing import Dict, List


In [None]:
class ColumnPrioritizer:
    """Helpers for comparing duplicate columns that hold the same logical field.

    The instance works on a private copy of the DataFrame it is given, so the
    caller's frame is never modified through this object.

    Attributes:
        df: Copy of the DataFrame passed to the constructor.
        duplicate_groups: Maps a canonical field name to the column-name
            variants that are treated as duplicates of one another.
    """

    # Addresses accepted by ``is_valid_email``; the comparison is exact and
    # therefore case-sensitive.
    VALID_EMAILS = ('help@globex.com', 'contact@acme.com')

    # Lower-case substrings whose presence marks an email as a placeholder.
    PLACEHOLDER_MARKERS = ('noemail', 'invalid@', 'user@', 'missing.com', 'placeholder')

    def __init__(self, df: pd.DataFrame):
        """Store a copy of ``df`` and set up the duplicate-column groups."""
        self.df = df.copy()

        # Hard-coded duplicate column groups based on the Salesforce dataset
        self.duplicate_groups: Dict[str, List[str]] = {
            'account_name': ['Account Name', 'account_name', 'AccountName'],
            'contact_email': ['Contact Email', 'contact_email'],
            'created_date': ['Created Date', 'created_date'],
            'lead_source': ['Lead Source', 'lead_source'],
            'opportunity_amount': ['Opportunity Amount', 'opportunity_amount'],
            'is_active': ['Is Active', 'is_active'],
            'sfdc_id': ['SFDC ID', 'sfdc_id'],
            'annual_revenue': ['Annual Revenue', 'annual_revenue']
        }

    def completeness_score(self, column_name: str) -> float:
        """Return the fraction of non-missing values in a column (0-1).

        Args:
            column_name: Name of the column to score.

        Returns:
            Share of rows whose value is not NA. ``0.0`` when the column does
            not exist or when the DataFrame has no rows.
        """
        if column_name not in self.df.columns:
            return 0.0
        total_rows = len(self.df)
        if total_rows == 0:
            # Avoid 0/0, which would yield NaN (and a RuntimeWarning) instead
            # of a score in the documented 0-1 range.
            return 0.0
        return float(self.df[column_name].notna().sum() / total_rows)

    def is_valid_email(self, email: str) -> bool:
        """Return True only if ``email`` is exactly one of ``VALID_EMAILS``.

        Missing values (``None``, NaN, ``pd.NA``) are never valid. The match
        is case-sensitive.
        """
        if pd.isna(email) or email is None:
            return False
        return email in self.VALID_EMAILS

    def is_placeholder_email(self, email: str) -> bool:
        """Return True if ``email`` is missing, empty, or an obvious placeholder.

        A value counts as a placeholder when it is NA/``None``, the empty
        string, or when its lower-cased text contains any of
        ``PLACEHOLDER_MARKERS``.
        """
        if pd.isna(email) or email is None or email == '':
            return True
        # Case-insensitive substring match against the known markers.
        lowered = str(email).lower()
        return any(marker in lowered for marker in self.PLACEHOLDER_MARKERS)
    
    
    

In [21]:
# Load the raw export and wrap it so column completeness can be queried.
df = pd.read_csv('data/DirtySalesforceData.csv')
prioritizer = ColumnPrioritizer(df)

# Column-name variants to compare, one list per logical field.
account_columns = ['Account Name', 'account_name', 'AccountName']
contact_columns = ['Contact Email', 'contact_email']

# Print one completeness report per group; variants absent from the file are skipped.
for heading, candidates in (
    ("Account Name Columns Analysis:", account_columns),
    ("Contact Email Columns Analysis:", contact_columns),
):
    print(heading)
    for col in candidates:
        if col not in df.columns:
            continue
        score = prioritizer.completeness_score(col)
        print(f"\n{col}:")
        print(f"  Completeness: {score:.3f}")


Account Name Columns Analysis:

Account Name:
  Completeness: 0.596

account_name:
  Completeness: 0.822

AccountName:
  Completeness: 0.702
Contact Email Columns Analysis:

Contact Email:
  Completeness: 0.754

contact_email:
  Completeness: 0.764


In [None]:
class Tester:
    """Stand-alone copy of the email checks, usable without a DataFrame."""

    def is_valid_email(self, email):
        """Return True only for the two accepted addresses (exact, case-sensitive)."""
        if email is None or pd.isna(email):
            return False
        accepted = ('help@globex.com', 'contact@acme.com')
        return email in accepted

    def is_placeholder_email(self, email):
        """Return True for missing, empty, or placeholder-looking emails."""
        if email is None or pd.isna(email):
            return True
        if email == '':
            return True
        # Case-insensitive substring search for known placeholder markers.
        text = str(email).lower()
        for marker in ('noemail', 'invalid@', 'user@', 'missing.com', 'placeholder'):
            if marker in text:
                return True
        return False
    

t = Tester()


def _run_email_checks(title, check, cases):
    """Print one result line per case, then fail if any result is unexpected.

    Args:
        title: Header line printed before the results.
        check: Callable taking an email value and returning a bool.
        cases: Iterable of ``(label, value, expected, note)`` tuples; ``note``
            is appended after the expected value inside the parentheses.

    Raises:
        AssertionError: If any case's result differs from its expected value.
            Raised only after every line has been printed, so the full report
            is still visible.
    """
    print(title)
    mismatches = []
    for label, value, expected, note in cases:
        actual = check(value)
        print(f"{label}: {actual} (should be {expected}{note})")
        if actual != expected:
            mismatches.append(label)
    print()
    assert not mismatches, f"{title} unexpected result for {mismatches}"


# Test is_valid_email
_run_email_checks("Testing is_valid_email:", t.is_valid_email, [
    ('help@globex.com', 'help@globex.com', True, ''),
    ('contact@acme.com', 'contact@acme.com', True, ''),
    ('random@email.com', 'random@email.com', False, ''),
    ('None', None, False, ''),
    ('empty string', '', False, ''),
    ('HELP@GLOBEX.COM', 'HELP@GLOBEX.COM', False, ' - case sensitive'),
])

# Test is_placeholder_email
_run_email_checks("Testing is_placeholder_email:", t.is_placeholder_email, [
    ('None', None, True, ''),
    ('empty string', '', True, ''),
    ('noemail', 'noemail', True, ''),
    ('invalid@', 'invalid@', True, ''),
    ('user@', 'user@', True, ''),
    ('missing.com', 'missing.com', True, ''),
    ('NOEMAIL', 'NOEMAIL', True, ' - case insensitive'),
    ('help@globex.com', 'help@globex.com', False, ''),
    ('real@email.com', 'real@email.com', False, ''),
])

Testing is_valid_email:
help@globex.com: True (should be True)
contact@acme.com: True (should be True)
random@email.com: False (should be False)
None: False (should be False)
empty string: False (should be False)
HELP@GLOBEX.COM: False (should be False - case sensitive)

Testing is_placeholder_email:
None: True (should be True)
empty string: True (should be True)
noemail: True (should be True)
invalid@: True (should be True)
user@: True (should be True)
missing.com: True (should be True)
NOEMAIL: True (should be True - case insensitive)
help@globex.com: False (should be False)
real@email.com: False (should be False)

