# In [None]:
import os
import pandas as pd
from pathlib import Path
import openpyxl
from openpyxl import load_workbook
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Configuration
# Root folder that is walked recursively for Excel workbooks to consolidate.
SOURCE_FOLDER = r"C:\Users\spt-admin\Desktop\PSWEPS_NEW"
# Workbook the analysis results are written to. It sits inside SOURCE_FOLDER,
# so the collection step skips this path when scanning for input files.
OUTPUT_FILE = r"C:\Users\spt-admin\Desktop\PSWEPS_NEW\PSWEPS_MASTER_ANALYSIS.xlsx"

# Required columns (case-insensitive matching)
# A sheet is collected only when every one of these headers is present; sheet
# headers are compared after upper-casing and stripping surrounding whitespace.
REQUIRED_COLUMNS = ['EMPLOYEE_NUMBER', 'FULL_NAME', 'SSNIT_NUMBER']

def log_message(message):
    """Write ``message`` to standard output, followed by a newline.

    Single choke point for the script's progress output, so every status
    line goes through the same call.
    """
    print(message)

def has_required_columns(df, required_columns=None):
    """Return True when ``df`` contains every required column.

    Column headers are compared case-insensitively and with surrounding
    whitespace ignored: each header is converted with ``str()``, upper-cased
    and stripped before the comparison. The required names themselves are
    compared as given, so they are expected to be upper-case already.

    Args:
        df: DataFrame to inspect. ``None`` or an empty frame yields False.
        required_columns: Iterable of upper-case column names that must all
            be present. Defaults to the module-level ``REQUIRED_COLUMNS``.

    Returns:
        bool: True only if every required name matches some header of ``df``.
    """
    if df is None or df.empty:
        return False

    if required_columns is None:
        required_columns = REQUIRED_COLUMNS

    # Normalize the headers once into a set so each lookup is O(1).
    present = {str(col).upper().strip() for col in df.columns}

    return all(req_col in present for req_col in required_columns)

def standardize_dataframe_columns(df, required_columns=None):
    """Rename headers that match a required column to its canonical name.

    A header matches when its upper-cased, whitespace-stripped string form
    equals one of the required names. Non-matching columns keep their
    original labels. The input frame is not modified; a renamed copy is
    returned.

    If several headers normalize to the same required name (for example
    ``'Full_Name'`` and ``'FULL_NAME'``), only the first such column is kept.
    Without this, the result would carry duplicate labels, and selecting a
    required column would return a DataFrame instead of a Series.

    Args:
        df: DataFrame to standardize.
        required_columns: Iterable of canonical (upper-case) column names.
            Defaults to the module-level ``REQUIRED_COLUMNS``.

    Returns:
        The renamed DataFrame, or ``None`` when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        return None

    if required_columns is None:
        required_columns = REQUIRED_COLUMNS
    wanted = list(required_columns)

    # Map each original header to its canonical name when it matches.
    col_mapping = {}
    for col in df.columns:
        col_upper = str(col).upper().strip()
        if col_upper in wanted:
            col_mapping[col] = col_upper

    df_renamed = df.rename(columns=col_mapping)

    # Drop later columns whose label repeats a required name; other columns
    # are left untouched even if their labels happen to repeat.
    repeated_required = df_renamed.columns.duplicated() & df_renamed.columns.isin(wanted)
    if repeated_required.any():
        df_renamed = df_renamed.loc[:, ~repeated_required]

    return df_renamed

def collect_all_data(source_folder):
    """Collect employee rows from every Excel workbook under ``source_folder``.

    The folder is walked recursively. Each ``.xlsx``/``.xls``/``.xlsm`` file
    is opened once and every sheet is read. A sheet is kept only when
    ``has_required_columns`` accepts it; kept sheets are reduced to
    ``REQUIRED_COLUMNS`` plus two tracking columns (``SOURCE_FILE`` holds the
    file name without its folder, ``SOURCE_SHEET`` the sheet name), and rows
    in which all required columns are empty are dropped.

    Errors while reading a file or a sheet are logged and skipped so a single
    bad workbook does not abort the whole run.

    Args:
        source_folder: Root directory to scan.

    Returns:
        A single DataFrame with all collected rows (fresh 0..n-1 index), or
        ``None`` when no sheet qualified.
    """
    excel_extensions = ('.xlsx', '.xls', '.xlsm')
    # Compare normalized absolute paths so the output workbook is recognized
    # regardless of letter case or how source_folder was spelled.
    output_path = os.path.normcase(os.path.abspath(OUTPUT_FILE))

    all_data = []
    file_count = 0
    sheet_count = 0
    skipped_count = 0

    log_message("=" * 80)
    log_message("Starting data collection...")
    log_message("=" * 80)

    # Walk through all directories
    for root, _dirs, files in os.walk(source_folder):
        for file in files:
            if os.path.splitext(file)[1].lower() not in excel_extensions:
                continue

            # "~$name.xlsx" files are lock files Excel creates next to an
            # open workbook; they are not readable workbooks.
            if file.startswith('~$'):
                continue

            file_path = os.path.join(root, file)

            # Skip the output file itself
            if os.path.normcase(os.path.abspath(file_path)) == output_path:
                continue

            try:
                file_count += 1
                log_message(f"\nProcessing: {file}")

                # Open the workbook once and close it deterministically;
                # sheets are parsed from this handle instead of re-opening
                # the file for every sheet.
                with pd.ExcelFile(file_path) as xl_file:
                    for sheet_name in xl_file.sheet_names:
                        try:
                            df = xl_file.parse(sheet_name)

                            if not has_required_columns(df):
                                skipped_count += 1
                                log_message(f"  ✗ Sheet '{sheet_name}': Skipped (missing required columns)")
                                continue

                            # Standardize column names, keep only the
                            # required ones, and guard against repeated
                            # labels. The copy makes the frame independent
                            # of the sheet before columns are added to it.
                            df = standardize_dataframe_columns(df)
                            df = df[REQUIRED_COLUMNS]
                            df = df.loc[:, ~df.columns.duplicated()].copy()

                            # Add source tracking columns
                            df['SOURCE_FILE'] = file
                            df['SOURCE_SHEET'] = sheet_name

                            # Remove rows where every required column is empty
                            df = df.dropna(how='all', subset=REQUIRED_COLUMNS)

                            all_data.append(df)
                            sheet_count += 1
                            log_message(f"  ✓ Sheet '{sheet_name}': {len(df)} rows collected")

                        except Exception as e:
                            # Best effort: one unreadable sheet must not
                            # stop the remaining sheets.
                            skipped_count += 1
                            log_message(f"  ✗ Sheet '{sheet_name}': Error - {str(e)}")

            except Exception as e:
                # Best effort: one unreadable workbook must not stop the scan.
                log_message(f"  ERROR reading file: {str(e)}")

    log_message("\n" + "=" * 80)
    log_message(f"Collection Summary:")
    log_message(f"  Files processed: {file_count}")
    log_message(f"  Sheets collected: {sheet_count}")
    log_message(f"  Sheets skipped: {skipped_count}")
    log_message("=" * 80 + "\n")

    if not all_data:
        return None

    # Combine all data
    return pd.concat(all_data, ignore_index=True)

def normalize_name(name):
    """Return a canonical form of ``name`` for equality comparison.

    The value is converted with ``str()``, upper-cased, periods are removed,
    commas are treated as word separators, and every run of whitespace is
    collapsed to a single space with leading/trailing whitespace removed.

    Treating a comma as a separator keeps ``"DOE,JOHN"`` and ``"DOE, JOHN"``
    equal (both become ``"DOE JOHN"``) instead of fusing the first into
    ``"DOEJOHN"``; collapsing whitespace makes ``"JOHN  DOE"`` equal to
    ``"JOHN DOE"``.

    Args:
        name: Raw name value; may be a missing value (``None``/NaN).

    Returns:
        str: The normalized name, or ``""`` for a missing value.
    """
    if pd.isna(name):
        return ""
    cleaned = str(name).upper().replace(',', ' ').replace('.', '')
    # split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so the join yields single-spaced text.
    return ' '.join(cleaned.split())

def analyze_duplicate_names(master_df):
    """Find names that appear with conflicting identifiers.

    Rows are grouped by the output of ``normalize_name`` applied to
    ``FULL_NAME``. Matching is exact on that normalized form; no fuzzy
    similarity scoring is performed here. A group is reported when it has
    more than one row and more than one distinct ``EMPLOYEE_NUMBER`` or
    ``SSNIT_NUMBER`` (missing values are not counted as distinct values).

    Rows whose name is missing or blank normalize to an empty string; they
    are skipped, because unrelated records without a name are not duplicates
    of each other.

    Side effect: a ``NAME_NORMALIZED`` column is added to ``master_df`` in
    place, so it is visible to whatever uses the frame afterwards.

    Args:
        master_df: Frame with ``FULL_NAME``, ``EMPLOYEE_NUMBER``,
            ``SSNIT_NUMBER``, ``SOURCE_FILE`` and ``SOURCE_SHEET`` columns.

    Returns:
        DataFrame with one row per member of every reported group, sorted by
        ``FULL_NAME`` then ``EMPLOYEE_NUMBER``; an empty DataFrame (no
        columns) when nothing is reported.
    """
    log_message("Analyzing duplicate names (fuzzy matching)...")

    # Create normalized names
    master_df['NAME_NORMALIZED'] = master_df['FULL_NAME'].apply(normalize_name)

    duplicates = []

    for name, group in master_df.groupby('NAME_NORMALIZED'):
        # An empty key means the name was missing/blank: not a real name.
        if not name or len(group) <= 1:
            continue

        # Check if they have different Employee Numbers or SSNIT
        unique_emp = group['EMPLOYEE_NUMBER'].nunique()
        unique_ssnit = group['SSNIT_NUMBER'].nunique()
        if unique_emp <= 1 and unique_ssnit <= 1:
            continue

        # Same name with differing identifiers: potential duplicate
        for _idx, row in group.iterrows():
            duplicates.append({
                'FULL_NAME': row['FULL_NAME'],
                'EMPLOYEE_NUMBER': row['EMPLOYEE_NUMBER'],
                'SSNIT_NUMBER': row['SSNIT_NUMBER'],
                'SOURCE_FILE': row['SOURCE_FILE'],
                'SOURCE_SHEET': row['SOURCE_SHEET'],
                'DUPLICATE_COUNT': len(group),
                'UNIQUE_EMP_NUMBERS': unique_emp,
                'UNIQUE_SSNIT': unique_ssnit
            })

    if not duplicates:
        return pd.DataFrame()

    dup_df = pd.DataFrame(duplicates)
    return dup_df.sort_values(['FULL_NAME', 'EMPLOYEE_NUMBER'])

def analyze_duplicate_employee_numbers(master_df):
    """Report employee numbers that are attached to conflicting records.

    Rows are grouped by ``EMPLOYEE_NUMBER``. A group is reported when it
    holds more than one row and those rows carry more than one distinct
    ``FULL_NAME`` or more than one distinct ``SSNIT_NUMBER``.

    Returns:
        DataFrame with one row per member of every reported group, sorted by
        ``EMPLOYEE_NUMBER`` then ``FULL_NAME``; an empty DataFrame (no
        columns) when nothing is reported.
    """
    log_message("Analyzing duplicate Employee Numbers...")

    flagged = []

    for _emp_num, members in master_df.groupby('EMPLOYEE_NUMBER'):
        member_count = len(members)
        if member_count <= 1:
            continue

        # Count distinct names / SSNIT values sharing this employee number.
        distinct_names = members['FULL_NAME'].nunique()
        distinct_ssnit = members['SSNIT_NUMBER'].nunique()
        if distinct_names <= 1 and distinct_ssnit <= 1:
            continue

        flagged.extend(
            {
                'EMPLOYEE_NUMBER': record['EMPLOYEE_NUMBER'],
                'FULL_NAME': record['FULL_NAME'],
                'SSNIT_NUMBER': record['SSNIT_NUMBER'],
                'SOURCE_FILE': record['SOURCE_FILE'],
                'SOURCE_SHEET': record['SOURCE_SHEET'],
                'DUPLICATE_COUNT': member_count,
                'UNIQUE_NAMES': distinct_names,
                'UNIQUE_SSNIT': distinct_ssnit,
            }
            for _, record in members.iterrows()
        )

    if not flagged:
        return pd.DataFrame()

    return pd.DataFrame(flagged).sort_values(['EMPLOYEE_NUMBER', 'FULL_NAME'])

def analyze_duplicate_ssnit(master_df):
    """Report SSNIT numbers that are attached to conflicting records.

    Rows without an ``SSNIT_NUMBER`` are ignored. The remaining rows are
    grouped by ``SSNIT_NUMBER``; a group is reported when it holds more than
    one row and those rows carry more than one distinct ``FULL_NAME`` or more
    than one distinct ``EMPLOYEE_NUMBER``.

    Returns:
        DataFrame with one row per member of every reported group, sorted by
        ``SSNIT_NUMBER`` then ``FULL_NAME``; an empty DataFrame (no columns)
        when nothing is reported.
    """
    log_message("Analyzing duplicate SSNIT Numbers...")

    # Only rows that actually carry an SSNIT number take part.
    has_ssnit = master_df['SSNIT_NUMBER'].notna()
    candidates = master_df[has_ssnit]

    flagged = []

    for _ssnit, members in candidates.groupby('SSNIT_NUMBER'):
        member_count = len(members)
        if member_count <= 1:
            continue

        # Count distinct names / employee numbers sharing this SSNIT number.
        distinct_names = members['FULL_NAME'].nunique()
        distinct_emp = members['EMPLOYEE_NUMBER'].nunique()
        if distinct_names <= 1 and distinct_emp <= 1:
            continue

        flagged.extend(
            {
                'SSNIT_NUMBER': record['SSNIT_NUMBER'],
                'EMPLOYEE_NUMBER': record['EMPLOYEE_NUMBER'],
                'FULL_NAME': record['FULL_NAME'],
                'SOURCE_FILE': record['SOURCE_FILE'],
                'SOURCE_SHEET': record['SOURCE_SHEET'],
                'DUPLICATE_COUNT': member_count,
                'UNIQUE_NAMES': distinct_names,
                'UNIQUE_EMP_NUMBERS': distinct_emp,
            }
            for _, record in members.iterrows()
        )

    if not flagged:
        return pd.DataFrame()

    return pd.DataFrame(flagged).sort_values(['SSNIT_NUMBER', 'FULL_NAME'])

def create_summary_statistics(master_df):
    """Build headline counts and a per-file record count.

    Args:
        master_df: Frame with ``EMPLOYEE_NUMBER``, ``FULL_NAME``,
            ``SSNIT_NUMBER``, ``SOURCE_FILE`` and ``SOURCE_SHEET`` columns.

    Returns:
        tuple: ``(stats_df, file_counts)`` where ``stats_df`` has the columns
        ``METRIC`` and ``VALUE`` and ``file_counts`` has the columns
        ``SOURCE_FILE`` and ``RECORD_COUNT`` (ordered by descending count, as
        produced by ``value_counts``).
    """
    log_message("Creating summary statistics...")

    # A sheet is identified by its file AND its name: counting distinct sheet
    # names alone would collapse e.g. "Sheet1" from many files into one.
    # NOTE: SOURCE_FILE is whatever the collector stored; if that is a bare
    # file name, equally named files in different folders are not
    # distinguishable here.
    sheet_total = len(master_df[['SOURCE_FILE', 'SOURCE_SHEET']].drop_duplicates())

    # Overall statistics (nunique ignores missing values)
    metrics = [
        ('Total Records', len(master_df)),
        ('Unique Employee Numbers', master_df['EMPLOYEE_NUMBER'].nunique()),
        ('Unique Names', master_df['FULL_NAME'].nunique()),
        ('Unique SSNIT Numbers', master_df['SSNIT_NUMBER'].nunique()),
        ('Unique Source Files', master_df['SOURCE_FILE'].nunique()),
        ('Total Source Sheets', sheet_total),
    ]
    stats_df = pd.DataFrame(metrics, columns=['METRIC', 'VALUE'])

    # Records by source file
    file_counts = master_df['SOURCE_FILE'].value_counts().reset_index()
    file_counts.columns = ['SOURCE_FILE', 'RECORD_COUNT']

    return stats_df, file_counts

def analyze_data_quality(master_df):
    """Summarize basic data-quality problems in the collected records.

    Four checks are run, in this order: missing ``EMPLOYEE_NUMBER``, missing
    ``FULL_NAME``, missing ``SSNIT_NUMBER``, and names shorter than three
    characters. Name length is measured on the ``str()`` form of each
    non-missing value, so the check works whatever dtype the column has
    (including a column that holds no strings at all).

    Args:
        master_df: Frame with ``EMPLOYEE_NUMBER``, ``FULL_NAME``,
            ``SSNIT_NUMBER`` and ``SOURCE_FILE`` columns.

    Returns:
        DataFrame with the columns ``ISSUE_TYPE``, ``COUNT`` and
        ``SAMPLE_FILE`` (the source file of the first affected row), one row
        per check that found something; a single "No Issues Found" row when
        every check is clean.
    """
    log_message("Analyzing data quality...")

    names = master_df['FULL_NAME']
    # astype(str) lets .str work on any dtype; notna() keeps missing names
    # out of the short-name check (they are reported as "Missing Name").
    short_name_mask = names.notna() & (names.astype(str).str.len() < 3)

    checks = [
        ('Missing Employee Number', master_df['EMPLOYEE_NUMBER'].isna()),
        ('Missing Name', names.isna()),
        ('Missing SSNIT Number', master_df['SSNIT_NUMBER'].isna()),
        ('Very Short Name (< 3 chars)', short_name_mask),
    ]

    issues = []
    for issue_type, mask in checks:
        affected = master_df[mask]
        if affected.empty:
            continue
        issues.append({
            'ISSUE_TYPE': issue_type,
            'COUNT': len(affected),
            'SAMPLE_FILE': affected.iloc[0]['SOURCE_FILE']
        })

    if issues:
        return pd.DataFrame(issues)
    return pd.DataFrame({'ISSUE_TYPE': ['No Issues Found'], 'COUNT': [0], 'SAMPLE_FILE': ['']})

def main():
    """Run the full pipeline: collect, analyze, write the workbook, report.

    Data is collected from ``SOURCE_FOLDER``; if nothing is collected an
    error is logged and the function returns without writing anything.
    Otherwise the analyses run and their results are written to
    ``OUTPUT_FILE``, one sheet per result, followed by a console summary.
    """
    rule = "=" * 80

    log_message("\n" + rule)
    log_message("MASTER CONSOLIDATION & ANALYSIS")
    log_message(rule + "\n")

    # Step 1: gather every qualifying sheet into one frame
    master_df = collect_all_data(SOURCE_FOLDER)
    if master_df is None or master_df.empty:
        log_message("ERROR: No data collected. Exiting.")
        return

    log_message(f"Total records collected: {len(master_df)}\n")

    # Step 2: run the analyses (order kept as-is; the name analysis runs first)
    log_message("Performing analyses...")
    dup_names_df = analyze_duplicate_names(master_df)
    dup_emp_df = analyze_duplicate_employee_numbers(master_df)
    dup_ssnit_df = analyze_duplicate_ssnit(master_df)
    stats_df, file_counts_df = create_summary_statistics(master_df)
    quality_df = analyze_data_quality(master_df)

    # Step 3: write the workbook
    log_message(f"\nWriting results to: {OUTPUT_FILE}")

    # (result frame, sheet name, placeholder text used when the frame is empty)
    duplicate_sheets = (
        (dup_names_df, 'Duplicate Names', 'No duplicate names found'),
        (dup_emp_df, 'Duplicate Employee Numbers', 'No duplicate employee numbers found'),
        (dup_ssnit_df, 'Duplicate SSNIT', 'No duplicate SSNIT numbers found'),
    )

    with pd.ExcelWriter(OUTPUT_FILE, engine='openpyxl') as writer:
        master_df.to_excel(writer, sheet_name='Master Data', index=False)

        for frame, sheet, placeholder in duplicate_sheets:
            # An empty result still gets its sheet, holding a short message.
            if frame.empty:
                frame = pd.DataFrame({'MESSAGE': [placeholder]})
            frame.to_excel(writer, sheet_name=sheet, index=False)

        stats_df.to_excel(writer, sheet_name='Summary Statistics', index=False)
        file_counts_df.to_excel(writer, sheet_name='Records by File', index=False)
        quality_df.to_excel(writer, sheet_name='Data Quality Issues', index=False)

    log_message(" Excel file created successfully!")

    # Final summary
    log_message("\n" + rule)
    log_message("ANALYSIS COMPLETE")
    log_message(rule)
    log_message(f"Master records: {len(master_df)}")
    log_message(f"Duplicate names found: {len(dup_names_df)}")
    log_message(f"Duplicate employee numbers: {len(dup_emp_df)}")
    log_message(f"Duplicate SSNIT numbers: {len(dup_ssnit_df)}")
    log_message(f"\nOutput file: {OUTPUT_FILE}")
    log_message(rule + "\n")

# Run the pipeline only when this file is executed as a script, so importing
# the module does not trigger the folder scan or write the output workbook.
if __name__ == "__main__":
    main()