In [29]:
import pandas as pd
import numpy as np
from typing import Dict, List, Optional


In [None]:
class ColumnPrioritizer:
    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()
        
        # Hard-coded duplicate column groups based on the Salesforce dataset
        self.duplicate_groups = {
            'account_name': ['Account Name', 'account_name', 'AccountName'],
            'contact_email': ['Contact Email', 'contact_email'],
            'created_date': ['Created Date', 'created_date'],
            'lead_source': ['Lead Source', 'lead_source'],
            'opportunity_amount': ['Opportunity Amount', 'opportunity_amount'],
            'is_active': ['Is Active', 'is_active'],
            'sfdc_id': ['SFDC ID', 'sfdc_id'],
            'annual_revenue': ['Annual Revenue', 'annual_revenue']
        }
    
    def completeness_score(self, column_name: str) -> float:
        # Calculate completeness score for a column (0-1).
        if column_name not in self.df.columns:
            return 0.0
        return self.df[column_name].notna().sum() / len(self.df)
    
    def quality_score(self, column_name: str) -> float:
        ##Calculate quality score for a column (0-1).
        if column_name not in self.df.columns:
            return 0.0
        
        series = self.df[column_name].dropna()
        if len(series) == 0:
            return 0.0
        
        # Check for error indicators
        error_indicators = ['not_a_date', 'noemail', 'invalid@', 'BAD_ID', 'N/A', 'NaT']
        error_count = sum(1 for value in series.astype(str) 
                         if any(indicator in str(value) for indicator in error_indicators))
        
        return max(0.0, 1.0 - (error_count / len(series)))

    def naming_score(self, column_name: str) -> float:
        """Score column based on naming convention (0-1)."""
        score = 0.0
        
        # Prefer snake_case
        if '_' in column_name and column_name.islower():
            score += 0.5
        # Prefer descriptive names
        if len(column_name) > 5:
            score += 0.3
        # Prefer standard patterns
        if any(pattern in column_name.lower() for pattern in ['name', 'email', 'date', 'id', 'amount']):
            score += 0.2
        
        return min(1.0, score)
    
    def consistency_score(self, column_name: str) -> float:
        """Score column based on data consistency (0-1)."""
        if column_name not in self.df.columns:
            return 0.0
        
        series = self.df[column_name].dropna()
        if len(series) == 0:
            return 0.0
        
        # Count unique non-null values vs total non-null values
        # More unique values relative to total = less consistent
        unique_ratio = len(series.unique()) / len(series)
        
        # Invert so higher consistency = higher score
        return max(0.0, 1.0 - unique_ratio)
    
    def is_valid_email(self, email: str) -> bool:
        # Check if email is one of the valid emails: help@globex.com or contact@acme.com
        if pd.isna(email) or email is None:
            return False
        return email in ['help@globex.com', 'contact@acme.com']
    
    def is_placeholder_email(self, email: str) -> bool:
        # Check if email is a placeholder (empty, None, or obvious placeholder)"""
        if pd.isna(email) or email is None or email == '':
            return True
        # Common placeholder patterns
        placeholders = ['noemail', 'invalid@', 'user@', 'missing.com', 'placeholder']
        return any(placeholder in str(email).lower() for placeholder in placeholders)
    
    def get_corresponding_account(self, email: str) -> Optional[str]:
        # Get the corresponding account name for a valid email
        if email == 'help@globex.com':
            return 'Globex'
        elif email == 'contact@acme.com':
            return 'Acme Corp'
        return None
    
    def generate_email_for_account(self, account_name: str) -> Optional[str]:
        # Generate corresponding email for Globex or Acme Corp accounts
        if account_name == 'Globex':
            return 'help@globex.com'
        elif account_name == 'Acme Corp':
            return 'contact@acme.com'
        return None
    
    def consolidate_account_and_email(self) -> pd.DataFrame:

        result_df = self.df.copy()
        
        result_df['consolidated_account_name'] = None
        result_df['consolidated_contact_email'] = None
        
        # Priority order
        account_columns = ['account_name', 'AccountName', 'Account Name']
        
        for idx, row in result_df.iterrows():

            contact_email_lower_val = row.get('contact_email')  # Priority email
            contact_email_val = row.get('Contact Email')        # Secondary email
            
            # Check which emails are valid
            contact_email_lower_valid = self.is_valid_email(contact_email_lower_val)
            contact_email_valid = self.is_valid_email(contact_email_val)
            
            # Get account values in priority order
            account_values = {}
            for col in account_columns:
                if col in result_df.columns:
                    account_values[col] = row.get(col)
            
            final_account = None
            final_email = None
            
            # Get account using STRICT priority (account_name -> AccountName -> Account Name)
            for col in account_columns:
                if (col in account_values and 
                    pd.notna(account_values[col]) and 
                    account_values[col] != ''):
                    final_account = account_values[col]
                    break
            
            # If no account found, derive from valid email (all accounts empty case)
            if final_account is None:
                # Check contact_email first (priority)
                if contact_email_lower_valid:
                    final_account = self.get_corresponding_account(contact_email_lower_val)
                    final_email = contact_email_lower_val
                # If contact_email not valid, check Contact Email
                elif contact_email_valid:
                    final_account = self.get_corresponding_account(contact_email_val)
                    final_email = contact_email_val
            
            # Set email based on chosen account (only if we have an account)
            if final_account is not None and final_email is None:
                email_matched = False
                
                if contact_email_lower_valid:
                    expected_account = self.get_corresponding_account(contact_email_lower_val)
                    if final_account == expected_account:
                        final_email = contact_email_lower_val
                        email_matched = True
                
                if not email_matched and contact_email_valid:
                    expected_account = self.get_corresponding_account(contact_email_val)
                    if final_account == expected_account:
                        final_email = contact_email_val
                        email_matched = True
                
                # If no email match but account is Globex or Acme Corp, generate email
                if not email_matched:
                    generated_email = self.generate_email_for_account(final_account)
                    if generated_email:
                        final_email = generated_email
            
            result_df.at[idx, 'consolidated_account_name'] = final_account
            result_df.at[idx, 'consolidated_contact_email'] = final_email
        
        return result_df
    
    def create_clean_dataset(self) -> pd.DataFrame:
        consolidated_df = self.consolidate_account_and_email()
        
        clean_df = consolidated_df.copy()
        
        clean_df['account_name'] = consolidated_df['consolidated_account_name']
        clean_df['contact_email'] = consolidated_df['consolidated_contact_email']
        
        columns_to_drop = [
            'consolidated_account_name', 'consolidated_contact_email',
            'Account Name', 'AccountName', 'Contact Email'
        ]
        
        columns_to_drop = [col for col in columns_to_drop if col in clean_df.columns]
        clean_df = clean_df.drop(columns=columns_to_drop)
        
        return clean_df
    
    
    
    

In [42]:
df = pd.read_csv('data/DirtySalesforceData.csv')
prioritizer = ColumnPrioritizer(df)

account_columns = ['Account Name', 'account_name', 'AccountName']
print("Account Name Columns Analysis:")
for col in account_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

contact_columns = ['Contact Email', 'contact_email']
print("Contact Email Columns Analysis:")
for col in contact_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

created_date_columns = ['Created Date', 'created_date']
print("Created Date Columns Analysis:")
for col in created_date_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

lead_source_columns = ['Lead Source', 'lead_source']
print("Lead Source Columns Analysis:")
for col in lead_source_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

opportunity_columns = ['Opportunity Name', 'opportunity_name']
print("Opportunity Name Columns Analysis:")
for col in opportunity_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

is_active_columns = ['Is Active', 'is_active']
print("Is Active Columns Analysis:")
for col in is_active_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

sfdc_columns = ['SFDC ID', 'sfdc_id']
print("Sfdc Id Columns Analysis:")
for col in sfdc_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")

annual_revenue_columns = ['Annual Revenue', 'annual_revenue']
print("Annual Revenue Columns Analysis:")
for col in annual_revenue_columns:
    if col in df.columns:
        print(f"\n{col}:")
        print(f"  Completeness: {prioritizer.completeness_score(col):.3f}")







Account Name Columns Analysis:

Account Name:
  Completeness: 0.596

account_name:
  Completeness: 0.822

AccountName:
  Completeness: 0.702
Contact Email Columns Analysis:

Contact Email:
  Completeness: 0.754

contact_email:
  Completeness: 0.764
Created Date Columns Analysis:

Created Date:
  Completeness: 0.776

created_date:
  Completeness: 0.758
Lead Source Columns Analysis:

Lead Source:
  Completeness: 0.572

lead_source:
  Completeness: 0.798
Opportunity Name Columns Analysis:
Is Active Columns Analysis:

Is Active:
  Completeness: 0.872

is_active:
  Completeness: 0.824
Sfdc Id Columns Analysis:

SFDC ID:
  Completeness: 0.634

sfdc_id:
  Completeness: 0.810
Annual Revenue Columns Analysis:

Annual Revenue:
  Completeness: 0.854

annual_revenue:
  Completeness: 0.742


In [26]:
class Tester:
    def is_valid_email(self, email):
        if pd.isna(email) or email is None:
            return False
        return email in ['help@globex.com', 'contact@acme.com']
    
    def is_placeholder_email(self, email):
        if pd.isna(email) or email is None or email == '':
            return True
        placeholders = ['noemail', 'invalid@', 'user@', 'missing.com', 'placeholder']
        return any(placeholder in str(email).lower() for placeholder in placeholders)
    
    def get_corresponding_account(self, email):
        if email == 'help@globex.com':
            return 'Globex'
        elif email == 'contact@acme.com':
            return 'Acme Corp'
        return None
    
    def generate_email_for_account(self, account_name):
        if account_name == 'Globex':
            return 'help@globex.com'
        elif account_name == 'Acme Corp':
            return 'contact@acme.com'
        return None


t = Tester()

# Test is_valid_email
print("Testing is_valid_email:")
print(f"help@globex.com: {t.is_valid_email('help@globex.com')} (should be True)")
print(f"contact@acme.com: {t.is_valid_email('contact@acme.com')} (should be True)")
print(f"random@email.com: {t.is_valid_email('random@email.com')} (should be False)")
print(f"None: {t.is_valid_email(None)} (should be False)")
print(f"empty string: {t.is_valid_email('')} (should be False)")
print(f"HELP@GLOBEX.COM: {t.is_valid_email('HELP@GLOBEX.COM')} (should be False - case sensitive)")
print()

# Test is_placeholder_email
print("Testing is_placeholder_email:")
print(f"None: {t.is_placeholder_email(None)} (should be True)")
print(f"empty string: {t.is_placeholder_email('')} (should be True)")
print(f"noemail: {t.is_placeholder_email('noemail')} (should be True)")
print(f"invalid@: {t.is_placeholder_email('invalid@')} (should be True)")
print(f"user@: {t.is_placeholder_email('user@')} (should be True)")
print(f"missing.com: {t.is_placeholder_email('missing.com')} (should be True)")
print(f"NOEMAIL: {t.is_placeholder_email('NOEMAIL')} (should be True - case insensitive)")
print(f"help@globex.com: {t.is_placeholder_email('help@globex.com')} (should be False)")
print(f"real@email.com: {t.is_placeholder_email('real@email.com')} (should be False)")
print()

# Test get_corresponding_account
print("Testing get_corresponding_account:")
print(f"help@globex.com: {t.get_corresponding_account('help@globex.com')} (should be Globex)")
print(f"contact@acme.com: {t.get_corresponding_account('contact@acme.com')} (should be Acme Corp)")
print(f"random@email.com: {t.get_corresponding_account('random@email.com')} (should be None)")
print(f"help@acme.com: {t.get_corresponding_account('help@acme.com')} (should be None)")
print(f"None: {t.get_corresponding_account(None)} (should be None)")
print()

# Test generate_email_for_account
print("Testing generate_email_for_account:")
print(f"Globex: {t.generate_email_for_account('Globex')} (should be help@globex.com)")
print(f"Acme Corp: {t.generate_email_for_account('Acme Corp')} (should be contact@acme.com)")
print(f"Initech: {t.generate_email_for_account('Initech')} (should be None)")
print(f"GLOBEX: {t.generate_email_for_account('GLOBEX')} (should be None - case sensitive)")
print(f"None: {t.generate_email_for_account(None)} (should be None)")
print()

Testing is_valid_email:
help@globex.com: True (should be True)
contact@acme.com: True (should be True)
random@email.com: False (should be False)
None: False (should be False)
empty string: False (should be False)
HELP@GLOBEX.COM: False (should be False - case sensitive)

Testing is_placeholder_email:
None: True (should be True)
empty string: True (should be True)
noemail: True (should be True)
invalid@: True (should be True)
user@: True (should be True)
missing.com: True (should be True)
NOEMAIL: True (should be True - case insensitive)
help@globex.com: False (should be False)
real@email.com: False (should be False)

Testing get_corresponding_account:
help@globex.com: Globex (should be Globex)
contact@acme.com: Acme Corp (should be Acme Corp)
random@email.com: None (should be None)
help@acme.com: None (should be None)
None: None (should be None)

Testing generate_email_for_account:
Globex: help@globex.com (should be help@globex.com)
Acme Corp: contact@acme.com (should be contact@acme.c

In [41]:
df = pd.read_csv('data/DirtySalesforceData.csv')

print("Original Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

prioritizer = ColumnPrioritizer(df)

clean_df = prioritizer.create_clean_dataset()

clean_df.to_csv('CleanSalesforceData.csv', index=False)
print(f"\nClean dataset saved as 'CleanSalesforceData.csv'")
print(f"Clean dataset shape: {clean_df.shape}")

print(f"\nSample of consolidated data:")
print(clean_df.head())


Original Dataset Info:
Shape: (500, 30)
Columns: ['Account Name', 'account_name', 'AccountName', 'Contact Email', 'contact_email', 'Created Date', 'created_date', 'Lead Source', 'lead_source', 'Opportunity Amount', 'opportunity_amount', 'Is Active', 'is_active', 'SFDC ID', 'sfdc_id', 'Annual Revenue', 'annual_revenue', 'Last Activity', 'Custom Field', 'Region', 'Unnamed: 0', 'Unnamed: 21', 'Random Notes', 'Deal Score', 'Engagement Level', 'Num Calls', 'Time on Page (sec)', 'City', 'State', 'Country']

Clean dataset saved as 'CleanSalesforceData.csv'
Clean dataset shape: (500, 27)

Sample of consolidated data:
  account_name     contact_email Created Date                created_date  \
0    Acme Corp  contact@acme.com   01-01-2020                         NaN   
1       Globex   help@globex.com   not_a_date                  2022/12/01   
2       Globex   help@globex.com   not_a_date                         NaT   
3    Acme Corp  contact@acme.com   not_a_date                         NaT  