In [10]:
import pandas as pd
import numpy as np
import re
from typing import Dict, List

class ColumnQualityAnalyzer:
    
    def __init__(self, df: pd.DataFrame):
        self.df = df
    
    def completeness_score(self, column: str) -> float:
        # How much of the column is filled
        if column not in self.df.columns:
            return 0.0
        return self.df[column].notna().mean()
    
    def quality_score(self, column: str) -> float:
        # How much of the data looks valid
        if column not in self.df.columns:
            return 0.0
        
        series = self.df[column].dropna()
        if len(series) == 0:
            return 0.0
        
        errors = ['not_a_date', 'noemail', 'invalid@', 'BAD_ID', 'N/A', 'NaT', 'ERROR']
        error_count = sum(1 for value in series.astype(str) 
                         if any(error in str(value) for error in errors))
        
        return 1.0 - (error_count / len(series))
    
    def format_consistency_score(self, column: str) -> float:
        # Check format consistency
        if column not in self.df.columns:
            return 0.0
            
        series = self.df[column].dropna()
        if len(series) == 0:
            return 0.0
        
        # Email format 
        if 'email' in column.lower():
            email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
            valid_emails = sum(1 for value in series.astype(str) 
                             if re.match(email_pattern, str(value)))
            return valid_emails / len(series)
        
        # Date format
        elif 'date' in column.lower() or 'created' in column.lower():
            valid_dates = 0
            for value in series:
                try:
                    # Try common date formats
                    pd.to_datetime(str(value))
                    valid_dates += 1
                except:
                    pass
            return valid_dates / len(series)
        
        # Boolean format
        elif 'active' in column.lower() or 'is_' in column.lower():
            valid_bools = sum(1 for value in series.astype(str).str.lower() 
                            if str(value).lower() in ['true', 'false', '1', '0', 'yes', 'no'])
            return valid_bools / len(series)
        
        return 1.0
    
    def data_type_consistency_score(self, column: str) -> float:
        if column not in self.df.columns:
            return 0.0
            
        series = self.df[column].dropna()
        if len(series) == 0:
            return 0.0
        
        # Amount/Revenue columns should be numeric
        if any(word in column.lower() for word in ['amount', 'revenue', 'score']):
            numeric_count = 0
            for value in series:
                try:
                    if isinstance(value, str):
                        if any(word in str(value).lower() for word in ['thousand', 'million', 'n/a']):
                            continue
                    float(value)
                    numeric_count += 1
                except:
                    pass
            return numeric_count / len(series)
        
        # ID columns should be strings/text
        elif 'id' in column.lower():
            # Most values should be convertible to string and non-empty
            valid_ids = sum(1 for value in series 
                          if str(value).strip() and str(value) != 'nan')
            return valid_ids / len(series)
        
        return 1.0
    
    def overall_score(self, column: str) -> float:
        completeness = self.completeness_score(column)
        quality = self.quality_score(column)
        format_consistency = self.format_consistency_score(column)
        data_type_consistency = self.data_type_consistency_score(column)
        
        return (completeness * 0.3) + (quality * 0.3) + (format_consistency * 0.2) + (data_type_consistency * 0.2)
    
    def compare_columns(self, columns: List[str]) -> Dict:
        results = {}
        
        for col in columns:
            if col in self.df.columns:
                results[col] = {
                    'completeness': self.completeness_score(col),
                    'quality': self.quality_score(col),
                    'format_consistency': self.format_consistency_score(col),
                    'data_type_consistency': self.data_type_consistency_score(col),
                    'overall': self.overall_score(col)
                }
        
        sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['overall'], reverse=True))
        best_column = next(iter(sorted_results)) if sorted_results else None
        
        return {
            'scores': sorted_results,
            'recommendation': best_column
        }
    
    def analyze_duplicates(self, duplicate_groups: Dict[str, List[str]]):
        print("COLUMN QUALITY ANALYSIS")
        print("=" * 50)
        
        for group_name, columns in duplicate_groups.items():

            existing_cols = [col for col in columns if col in self.df.columns]
            
            if len(existing_cols) <= 1:
                continue
                
            print(f"\n{group_name.upper()}:")
            print("-" * 30)
            
            comparison = self.compare_columns(existing_cols)
            recommended = comparison['recommendation']
            
            for col, scores in comparison['scores'].items():
                status = " ← BEST" if col == recommended else ""
                print(f"{col}{status}")
                print(f"  Overall: {scores['overall']:.2f}")
                print(f"  Complete: {scores['completeness']:.2f}")
                print(f"  Quality: {scores['quality']:.2f}")
                print(f"  Format: {scores['format_consistency']:.2f}")
                print(f"  DataType: {scores['data_type_consistency']:.2f}")
                print()

if __name__ == "__main__":
    
    duplicate_groups = {
        'account_name': ['Account Name', 'account_name', 'AccountName'],
        'contact_email': ['Contact Email', 'contact_email'],
        'created_date': ['Created Date', 'created_date'],
        'lead_source': ['Lead Source', 'lead_source'],
        'opportunity_amount': ['Opportunity Amount', 'opportunity_amount'],
        'is_active': ['Is Active', 'is_active'],
        'sfdc_id': ['SFDC ID', 'sfdc_id'],
        'annual_revenue': ['Annual Revenue', 'annual_revenue']
    }
    
    df = pd.read_csv('data/DirtySalesforceData.csv')
    analyzer = ColumnQualityAnalyzer(df)
    analyzer.analyze_duplicates(duplicate_groups)
    

COLUMN QUALITY ANALYSIS

ACCOUNT_NAME:
------------------------------
account_name ← BEST
  Overall: 0.95
  Complete: 0.82
  Quality: 1.00
  Format: 1.00
  DataType: 1.00

AccountName
  Overall: 0.91
  Complete: 0.70
  Quality: 1.00
  Format: 1.00
  DataType: 1.00

Account Name
  Overall: 0.88
  Complete: 0.60
  Quality: 1.00
  Format: 1.00
  DataType: 1.00


CONTACT_EMAIL:
------------------------------
contact_email ← BEST
  Overall: 0.80
  Complete: 0.76
  Quality: 1.00
  Format: 0.35
  DataType: 1.00

Contact Email
  Overall: 0.58
  Complete: 0.75
  Quality: 0.31
  Format: 0.31
  DataType: 1.00


CREATED_DATE:
------------------------------
created_date ← BEST
  Overall: 0.82
  Complete: 0.76
  Quality: 0.65
  Format: 1.00
  DataType: 1.00

Created Date
  Overall: 0.77
  Complete: 0.78
  Quality: 0.67
  Format: 0.67
  DataType: 1.00


LEAD_SOURCE:
------------------------------
lead_source ← BEST
  Overall: 0.94
  Complete: 0.80
  Quality: 1.00
  Format: 1.00
  DataType: 1.00

Lead