In [13]:
import pandas as pd
import numpy as np
from typing import Dict, List, Optional


In [None]:
class Cleansing:
    """Collapse duplicated Salesforce export columns into one clean column each.

    The export carries the same field under several spellings (for example
    ``Account Name`` / ``account_name`` / ``AccountName``). Each
    ``consolidate_*`` method returns a copy of ``self.df`` with an extra
    ``consolidated_*`` column; ``create_clean_dataset`` chains all of them and
    drops the redundant spellings.

    None of the methods modify ``self.df``.
    """

    def __init__(self, df: pd.DataFrame):
        # Work on a private copy so the caller's frame is never touched.
        self.df = df.copy()

        # Canonical field name -> spellings of that field in the export.
        # Kept as a public attribute; the methods below do not read it.
        self.duplicate_groups = {
            'account_name': ['Account Name', 'account_name', 'AccountName'],
            'contact_email': ['Contact Email', 'contact_email'],
            'created_date': ['Created Date', 'created_date'],
            'lead_source': ['Lead Source', 'lead_source'],
            'opportunity_amount': ['Opportunity Amount', 'opportunity_amount'],
            'is_active': ['Is Active', 'is_active'],
            'sfdc_id': ['SFDC ID', 'sfdc_id'],
            'annual_revenue': ['Annual Revenue', 'annual_revenue']
        }

    # ------------------------------------------------------------------
    # Email / account lookups
    # ------------------------------------------------------------------
    def is_valid_email(self, email: str) -> bool:
        """Return True only when ``email`` is one of the two known-good addresses."""
        if pd.isna(email) or email is None:
            return False
        return email in ['help@globex.com', 'contact@acme.com']

    def is_placeholder_email(self, email: str) -> bool:
        """Return True for missing/empty values or values containing a placeholder marker."""
        if pd.isna(email) or email is None or email == '':
            return True
        placeholders = ['noemail', 'invalid@', 'user@', 'missing.com', 'placeholder']
        return any(placeholder in str(email).lower() for placeholder in placeholders)

    def get_corresponding_account(self, email: str) -> Optional[str]:
        """Map a known-good email to its account name; None for anything else."""
        if email == 'help@globex.com':
            return 'Globex'
        elif email == 'contact@acme.com':
            return 'Acme Corp'
        return None

    def generate_email_for_account(self, account_name: str) -> Optional[str]:
        """Map 'Globex' / 'Acme Corp' to its known email; None for anything else."""
        if account_name == 'Globex':
            return 'help@globex.com'
        elif account_name == 'Acme Corp':
            return 'contact@acme.com'
        return None

    # ------------------------------------------------------------------
    # Shared plumbing
    # ------------------------------------------------------------------
    @staticmethod
    def _present(frame: pd.DataFrame, candidates: List[str]) -> List[str]:
        """Return the candidate column names that exist in ``frame``, in order."""
        return [col for col in candidates if col in frame.columns]

    @staticmethod
    def _object_column(values: List, index: pd.Index) -> pd.Series:
        """Wrap per-row results as an object Series aligned to ``index`` by position.

        Assigning a whole column this way (instead of ``frame.at[label, col]``
        per row) keeps rows independent even when index labels repeat.
        """
        return pd.Series(values, index=index, dtype=object)

    # ------------------------------------------------------------------
    # Account name + contact email
    # ------------------------------------------------------------------
    def _resolve_account_and_email(self, row: pd.Series, account_columns: List[str]) -> tuple:
        """Pick the account name and contact email for one row.

        Returns an ``(account, email)`` pair; either element may be None.
        """
        lower_email = row.get('contact_email')
        title_email = row.get('Contact Email')
        lower_ok = self.is_valid_email(lower_email)
        title_ok = self.is_valid_email(title_email)

        final_account = None
        final_email = None

        # First non-empty account spelling wins.
        for col in account_columns:
            candidate = row.get(col)
            if pd.notna(candidate) and candidate != '':
                final_account = candidate
                break

        # No account at all: derive it from a valid email
        # ('contact_email' takes priority over 'Contact Email').
        if final_account is None:
            if lower_ok:
                final_account = self.get_corresponding_account(lower_email)
                final_email = lower_email
            elif title_ok:
                final_account = self.get_corresponding_account(title_email)
                final_email = title_email

        if final_account is not None and final_email is None:
            # Accept a valid email only if it belongs to the chosen account.
            for email, is_ok in ((lower_email, lower_ok), (title_email, title_ok)):
                if is_ok and self.get_corresponding_account(email) == final_account:
                    final_email = email
                    break
            else:
                # Nothing matched: known accounts get their known address,
                # every other account keeps None.
                final_email = self.generate_email_for_account(final_account)

        return final_account, final_email

    def _add_account_and_email(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Add both consolidated account/email columns to ``frame`` (mutated and returned)."""
        # Create the columns first so their position matches a fresh run and
        # any stale values from an earlier pass are cleared.
        frame['consolidated_account_name'] = None
        frame['consolidated_contact_email'] = None

        account_columns = self._present(frame, ['account_name', 'AccountName', 'Account Name'])

        accounts: List = []
        emails: List = []
        for _, row in frame.iterrows():
            account, email = self._resolve_account_and_email(row, account_columns)
            accounts.append(account)
            emails.append(email)

        frame['consolidated_account_name'] = self._object_column(accounts, frame.index)
        frame['consolidated_contact_email'] = self._object_column(emails, frame.index)
        return frame

    def consolidate_account_and_email(self) -> pd.DataFrame:
        """Return a copy of ``self.df`` with the consolidated account and email columns added."""
        return self._add_account_and_email(self.df.copy())

    # ------------------------------------------------------------------
    # Created date
    # ------------------------------------------------------------------
    def _resolve_created_date(self, row: pd.Series, date_columns: List[str]) -> Optional[str]:
        """Return the first parseable created date of a row as YYYY-MM-DD, else None."""
        for col in date_columns:
            raw_date = row.get(col)
            if (pd.isna(raw_date) or raw_date == '' or
                    str(raw_date).lower() in ['nat', 'not_a_date', 'none']):
                continue

            formatted_date = self.format_date_to_standard(raw_date)
            if formatted_date:
                return formatted_date
        return None

    def _add_created_date(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Add ``consolidated_created_date`` to ``frame`` (mutated and returned)."""
        frame['consolidated_created_date'] = None
        date_columns = self._present(frame, ['created_date', 'Created Date'])
        dates = [self._resolve_created_date(row, date_columns) for _, row in frame.iterrows()]
        frame['consolidated_created_date'] = self._object_column(dates, frame.index)
        return frame

    def consolidate_created_date(self) -> pd.DataFrame:
        """Return a copy of ``self.df`` with ``consolidated_created_date`` added."""
        return self._add_created_date(self.df.copy())

    def format_date_to_standard(self, date_value) -> Optional[str]:
        """Parse ``date_value`` with pandas and return it as YYYY-MM-DD.

        Returns None for missing or empty input and for values pandas cannot parse.
        """
        if pd.isna(date_value) or date_value is None or date_value == '':
            return None

        try:
            parsed_date = pd.to_datetime(date_value, errors='coerce')

            # errors='coerce' reports an unparseable value as NaT.
            if pd.isna(parsed_date):
                return None

            return parsed_date.strftime('%Y-%m-%d')

        except (ValueError, TypeError):
            return None

    # ------------------------------------------------------------------
    # Lead source
    # ------------------------------------------------------------------
    def _resolve_lead_source(self, row: pd.Series, source_columns: List[str]):
        """Return the first usable lead-source value of a row, else None."""
        for col in source_columns:
            lead_source_val = row.get(col)
            if (pd.notna(lead_source_val) and
                    lead_source_val != '' and
                    str(lead_source_val).lower() not in ['nat', 'none', 'null']):
                return lead_source_val
        return None

    def _add_lead_source(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Add ``consolidated_lead_source`` to ``frame`` (mutated and returned)."""
        frame['consolidated_lead_source'] = None
        source_columns = self._present(frame, ['lead_source', 'Lead Source'])
        sources = [self._resolve_lead_source(row, source_columns) for _, row in frame.iterrows()]
        frame['consolidated_lead_source'] = self._object_column(sources, frame.index)
        return frame

    def consolidate_lead_source(self) -> pd.DataFrame:
        """Return a copy of ``self.df`` with ``consolidated_lead_source`` added."""
        return self._add_lead_source(self.df.copy())

    # ------------------------------------------------------------------
    # Full pipeline
    # ------------------------------------------------------------------
    def create_clean_dataset(self) -> pd.DataFrame:
        """Run every consolidation step and return the cleaned frame.

        The canonical columns ``account_name``, ``contact_email``,
        ``created_date`` and ``lead_source`` hold the consolidated values; the
        alternative spellings and the intermediate ``consolidated_*`` columns
        are dropped. ``self.df`` is left exactly as it was, so the method can
        be called repeatedly with the same result.
        """
        # Chain the steps on one local working copy instead of rebinding self.df.
        clean_df = self._add_account_and_email(self.df.copy())
        clean_df = self._add_created_date(clean_df)
        clean_df = self._add_lead_source(clean_df)

        clean_df['account_name'] = clean_df['consolidated_account_name']
        clean_df['contact_email'] = clean_df['consolidated_contact_email']
        clean_df['created_date'] = clean_df['consolidated_created_date']
        clean_df['lead_source'] = clean_df['consolidated_lead_source']

        columns_to_drop = [
            'consolidated_account_name', 'consolidated_contact_email', 'consolidated_created_date',
            'consolidated_lead_source', 'Account Name', 'AccountName', 'Contact Email',
            'Created Date', 'Lead Source'
        ]

        columns_to_drop = [col for col in columns_to_drop if col in clean_df.columns]
        clean_df = clean_df.drop(columns=columns_to_drop)

        return clean_df
    
    
    
    

In [20]:
import pandas as pd
from typing import Optional

class DateTester:
    """Stand-alone harness for trying out the date helpers on sample inputs."""

    def format_date_to_standard(self, date_value) -> Optional[str]:
        """
        Format date to standard YYYY-MM-DD format.

        Returns None for missing or empty input and for values pandas cannot
        parse as a date.
        """
        if pd.isna(date_value) or date_value is None or date_value == '':
            return None

        try:
            # pandas infers the input layout; unparseable values become NaT.
            parsed_date = pd.to_datetime(date_value, errors='coerce')

            if pd.isna(parsed_date):
                return None

            return parsed_date.strftime('%Y-%m-%d')

        except (ValueError, TypeError):
            return None

    def is_valid_date_format(self, date_string) -> bool:
        """
        Check if date string is a real calendar date written exactly as YYYY-MM-DD.

        Returns False for missing or empty input, for any other layout
        (e.g. '2023-1-5', '01/15/2023') and for impossible dates such as
        '2023-02-29'.
        """
        if pd.isna(date_string) or date_string is None or date_string == '':
            return False

        text = str(date_string)
        if len(text) != 10 or text.count('-') != 2:
            return False

        # Exactly two dashes guarantees three parts.
        year, month, day = text.split('-')
        if (len(year), len(month), len(day)) != (4, 2, 2):
            return False
        if not (year.isdigit() and month.isdigit() and day.isdigit()):
            return False

        try:
            # Layout is right; let pandas decide whether the date exists.
            pd.to_datetime(date_string, format='%Y-%m-%d', errors='raise')
        except (ValueError, TypeError):
            # Only parsing failures mean "invalid"; interrupts and other
            # unrelated errors are allowed to propagate.
            return False
        return True


dt = DateTester()

# Each section builds its report lines first, then emits the heading, the
# lines and the trailing blank line with a single print call.

# Normalisation of assorted input layouts to YYYY-MM-DD.
format_report = [
    f"'2023-01-15': {dt.format_date_to_standard('2023-01-15')} (should be 2023-01-15)",
    f"'01/15/2023': {dt.format_date_to_standard('01/15/2023')} (should be 2023-01-15)",
    f"'15-01-2023': {dt.format_date_to_standard('15-01-2023')} (should be 2023-01-15)",
    f"'2023/01/15': {dt.format_date_to_standard('2023/01/15')} (should be 2023-01-15)",
    f"'Jan 15, 2023': {dt.format_date_to_standard('Jan 15, 2023')} (should be 2023-01-15)",
    f"'15 January 2023': {dt.format_date_to_standard('15 January 2023')} (should be 2023-01-15)",
    f"'2023-1-5': {dt.format_date_to_standard('2023-1-5')} (should be 2023-01-05)",
    f"None: {dt.format_date_to_standard(None)} (should be None)",
    f"empty string: {dt.format_date_to_standard('')} (should be None)",
    f"'invalid_date': {dt.format_date_to_standard('invalid_date')} (should be None)",
    f"'2023-13-45': {dt.format_date_to_standard('2023-13-45')} (should be None - invalid date)",
    f"'not a date': {dt.format_date_to_standard('not a date')} (should be None)",
]
print("Testing format_date_to_standard:", *format_report, "", sep="\n")

# Strict YYYY-MM-DD layout check.
validity_report = [
    f"'2023-01-15': {dt.is_valid_date_format('2023-01-15')} (should be True)",
    f"'2023-12-31': {dt.is_valid_date_format('2023-12-31')} (should be True)",
    f"'2023-1-5': {dt.is_valid_date_format('2023-1-5')} (should be False - wrong format)",
    f"'01/15/2023': {dt.is_valid_date_format('01/15/2023')} (should be False - wrong format)",
    f"'2023/01/15': {dt.is_valid_date_format('2023/01/15')} (should be False - wrong format)",
    f"'Jan 15, 2023': {dt.is_valid_date_format('Jan 15, 2023')} (should be False - wrong format)",
    f"'2023-13-45': {dt.is_valid_date_format('2023-13-45')} (should be False - invalid date)",
    f"None: {dt.is_valid_date_format(None)} (should be False)",
    f"empty string: {dt.is_valid_date_format('')} (should be False)",
    f"'not a date': {dt.is_valid_date_format('not a date')} (should be False)",
    f"'2023-02-29': {dt.is_valid_date_format('2023-02-29')} (should be False - not a leap year)",
    f"'2024-02-29': {dt.is_valid_date_format('2024-02-29')} (should be True - leap year)",
]
print("Testing is_valid_date_format:", *validity_report, "", sep="\n")

# Boundary dates, leap days and a numeric input.
edge_report = [
    f"'2023-01-01': {dt.format_date_to_standard('2023-01-01')} (should be 2023-01-01)",
    f"'2023-12-31': {dt.format_date_to_standard('2023-12-31')} (should be 2023-12-31)",
    f"'1900-01-01': {dt.format_date_to_standard('1900-01-01')} (should be 1900-01-01)",
    f"'2100-12-31': {dt.format_date_to_standard('2100-12-31')} (should be 2100-12-31)",
    f"'02/29/2024': {dt.format_date_to_standard('02/29/2024')} (should be 2024-02-29 - leap year)",
    f"'02/29/2023': {dt.format_date_to_standard('02/29/2023')} (should be None - not leap year)",
    f"123456789: {dt.format_date_to_standard(123456789)} (should handle numeric input)",
]
print("Testing edge cases:", *edge_report, "", sep="\n")

Testing format_date_to_standard:
'2023-01-15': 2023-01-15 (should be 2023-01-15)
'01/15/2023': 2023-01-15 (should be 2023-01-15)
'15-01-2023': 2023-01-15 (should be 2023-01-15)
'2023/01/15': 2023-01-15 (should be 2023-01-15)
'Jan 15, 2023': 2023-01-15 (should be 2023-01-15)
'15 January 2023': 2023-01-15 (should be 2023-01-15)
'2023-1-5': 2023-01-05 (should be 2023-01-05)
None: None (should be None)
empty string: None (should be None)
'invalid_date': None (should be None)
'2023-13-45': None (should be None - invalid date)
'not a date': None (should be None)

Testing is_valid_date_format:
'2023-01-15': True (should be True)
'2023-12-31': True (should be True)
'2023-1-5': False (should be False - wrong format)
'01/15/2023': False (should be False - wrong format)
'2023/01/15': False (should be False - wrong format)
'Jan 15, 2023': False (should be False - wrong format)
'2023-13-45': False (should be False - invalid date)
None: False (should be False)
empty string: False (should be False)
'

  parsed_date = pd.to_datetime(date_value, errors='coerce')


In [23]:
# Load the raw export, run the cleansing pipeline once, save and preview the result.
df = pd.read_csv('data/DirtySalesforceData.csv')

print("Original Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

cleaner = Cleansing(df)
# One call is enough: the pipeline is row-by-row, so running it a second time
# and discarding the first result only doubles the runtime.
clean_df = cleaner.create_clean_dataset()

# Consolidation merges columns; it must never add or drop rows.
assert len(clean_df) == len(df), "row count changed during cleansing"

clean_df.to_csv('CleanSalesforceData.csv', index=False)
print("\nClean dataset saved as 'CleanSalesforceData.csv'")
print(f"Clean dataset shape: {clean_df.shape}")

print("\nSample of consolidated data:")
print(clean_df.head(30))


Original Dataset Info:
Shape: (500, 30)
Columns: ['Account Name', 'account_name', 'AccountName', 'Contact Email', 'contact_email', 'Created Date', 'created_date', 'Lead Source', 'lead_source', 'Opportunity Amount', 'opportunity_amount', 'Is Active', 'is_active', 'SFDC ID', 'sfdc_id', 'Annual Revenue', 'annual_revenue', 'Last Activity', 'Custom Field', 'Region', 'Unnamed: 0', 'Unnamed: 21', 'Random Notes', 'Deal Score', 'Engagement Level', 'Num Calls', 'Time on Page (sec)', 'City', 'State', 'Country']

Clean dataset saved as 'CleanSalesforceData.csv'
Clean dataset shape: (500, 25)

Sample of consolidated data:
   account_name     contact_email created_date     lead_source  \
0     Acme Corp  contact@acme.com   2020-01-01      Trade Show   
1        Globex   help@globex.com   2022-12-01    Social Media   
2        Globex   help@globex.com         None  Email Campaign   
3     Acme Corp  contact@acme.com         None    Social Media   
4      Umbrella              None   2024-03-26       