In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List

class ColumnQualityAnalyzer:
    """Score DataFrame columns on how complete and how valid they look.

    Two per-column metrics are produced, each in the range [0.0, 1.0]:

    * completeness -- fraction of rows whose value is not missing;
    * quality -- fraction of non-missing values whose string form contains
      none of the substrings in ``ERROR_PATTERNS``.

    The overall score is the weighted sum of the two. Columns that are not
    present in the frame, and columns with no rows, score 0.0.
    """

    # Substrings that flag a value as invalid when they occur anywhere in
    # its string form. Matching is case-sensitive. Override on a subclass or
    # an instance to use a different pattern set.
    ERROR_PATTERNS = (
        'not_a_date', 'noemail', 'invalid@', 'BAD_ID', 'N/A', 'NaT', 'ERROR',
    )

    # Equal weight between completeness and quality.
    COMPLETENESS_WEIGHT = 0.5
    QUALITY_WEIGHT = 0.5

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to *df*; the frame is neither copied nor modified."""
        self.df = df

    def completeness_score(self, column: str) -> float:
        """Return the fraction of rows in *column* that are not missing.

        Returns 0.0 when the column does not exist or the frame has no rows.
        """
        if column not in self.df.columns:
            return 0.0

        values = self.df[column]
        # The mean of an empty Series is NaN, which would leak into the
        # overall score and the ranking; report "nothing filled" instead.
        if values.empty:
            return 0.0

        # float() so callers get a plain Python float, as annotated.
        return float(values.notna().mean())

    def quality_score(self, column: str) -> float:
        """Return the fraction of non-missing values that look valid.

        A value counts as invalid when its string form contains any entry of
        ``ERROR_PATTERNS``. Returns 0.0 when the column does not exist or has
        no non-missing values.
        """
        if column not in self.df.columns:
            return 0.0

        present = self.df[column].dropna()
        if present.empty:
            return 0.0

        patterns = self.ERROR_PATTERNS
        # astype(str) already yields strings, so each value is tested as-is.
        flagged = sum(
            any(pattern in text for pattern in patterns)
            for text in present.astype(str)
        )

        return 1.0 - (flagged / len(present))

    def _combine(self, completeness: float, quality: float) -> float:
        """Blend the two metrics into the overall score using the class weights."""
        return (
            completeness * self.COMPLETENESS_WEIGHT
            + quality * self.QUALITY_WEIGHT
        )

    def overall_score(self, column: str) -> float:
        """Return the weighted sum of completeness and quality for *column*."""
        return self._combine(
            self.completeness_score(column),
            self.quality_score(column),
        )

    def compare_columns(self, columns: List[str]) -> Dict:
        """Score every listed column that exists and rank them.

        Returns a dict with two keys:

        * ``'scores'`` -- maps column name to a dict with ``'completeness'``,
          ``'quality'`` and ``'overall'``, ordered by overall score from
          highest to lowest (ties keep the order given in *columns*);
        * ``'recommendation'`` -- the top-ranked column name, or ``None`` when
          none of the listed columns exist.
        """
        results = {}

        for col in columns:
            if col not in self.df.columns:
                continue
            # Compute each metric once and derive the overall score from
            # them rather than rescanning the column via overall_score().
            completeness = self.completeness_score(col)
            quality = self.quality_score(col)
            results[col] = {
                'completeness': completeness,
                'quality': quality,
                'overall': self._combine(completeness, quality),
            }

        # Sort by overall score, best first.
        sorted_results = dict(
            sorted(results.items(), key=lambda item: item[1]['overall'], reverse=True)
        )
        best_column = next(iter(sorted_results), None)

        return {
            'scores': sorted_results,
            'recommendation': best_column,
        }

    def analyze_duplicates(self, duplicate_groups: Dict[str, List[str]]) -> None:
        """Print a ranked score report for each group of candidate columns.

        *duplicate_groups* maps a group label to a list of column names.
        Groups with fewer than two columns present in the frame are skipped,
        since there is nothing to compare.
        """
        print("COLUMN QUALITY ANALYSIS")
        print("=" * 50)

        for group_name, columns in duplicate_groups.items():
            existing_cols = [col for col in columns if col in self.df.columns]

            if len(existing_cols) <= 1:
                continue

            print(f"\n{group_name.upper()}:")
            print("-" * 30)

            comparison = self.compare_columns(existing_cols)
            recommended = comparison['recommendation']

            for col, scores in comparison['scores'].items():
                status = " ← BEST" if col == recommended else ""
                print(f"{col}{status}")
                print(f"  Overall: {scores['overall']:.2f}")
                print(f"  Complete: {scores['completeness']:.2f}")
                print(f"  Quality: {scores['quality']:.2f}")
                print()

if __name__ == "__main__":

    # Group label -> column names that analyze_duplicates() compares
    # against each other; columns absent from the CSV are ignored there.
    duplicate_groups = dict(
        account_name=['Account Name', 'account_name', 'AccountName'],
        contact_email=['Contact Email', 'contact_email'],
        created_date=['Created Date', 'created_date'],
        lead_source=['Lead Source', 'lead_source'],
        opportunity_amount=['Opportunity Amount', 'opportunity_amount'],
        is_active=['Is Active', 'is_active'],
        sfdc_id=['SFDC ID', 'sfdc_id'],
        annual_revenue=['Annual Revenue', 'annual_revenue'],
    )

    # Read the input file, then print the per-group ranking report.
    df = pd.read_csv('data/DirtySalesforceData.csv')
    analyzer = ColumnQualityAnalyzer(df=df)
    analyzer.analyze_duplicates(duplicate_groups=duplicate_groups)
    

COLUMN QUALITY ANALYSIS

ACCOUNT_NAME:
------------------------------
account_name ← BEST
  Overall: 0.91
  Complete: 0.82
  Quality: 1.00

AccountName
  Overall: 0.85
  Complete: 0.70
  Quality: 1.00

Account Name
  Overall: 0.80
  Complete: 0.60
  Quality: 1.00


CONTACT_EMAIL:
------------------------------
contact_email ← BEST
  Overall: 0.88
  Complete: 0.76
  Quality: 1.00

Contact Email
  Overall: 0.53
  Complete: 0.75
  Quality: 0.31


CREATED_DATE:
------------------------------
Created Date ← BEST
  Overall: 0.72
  Complete: 0.78
  Quality: 0.67

created_date
  Overall: 0.70
  Complete: 0.76
  Quality: 0.65


LEAD_SOURCE:
------------------------------
lead_source ← BEST
  Overall: 0.90
  Complete: 0.80
  Quality: 1.00

Lead Source
  Overall: 0.79
  Complete: 0.57
  Quality: 1.00


OPPORTUNITY_AMOUNT:
------------------------------
Opportunity Amount ← BEST
  Overall: 0.83
  Complete: 0.66
  Quality: 1.00

opportunity_amount
  Overall: 0.79
  Complete: 0.59
  Quality: 1.00


