# In [None]:
import os
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Locations of the corrected master CSV, the schedule workbooks to scan,
# and the folder that receives the rewritten workbooks and reports.
CORRECTED_MASTER_FILE = r"C:\Users\spt-admin\Desktop\PSWEPS_NEWADD\NEW FOLDERS\Master_Data_Corrected.csv"
SCHEDULES_FOLDER = r"C:\Users\spt-admin\Desktop\PSWEPS_NEWSCHEDULES_DONE"
OUTPUT_FOLDER = r"C:\Users\spt-admin\Desktop\PSWEPS_SCHEDULES"

# The output folder is created on first use; the message is printed only
# when the folder was actually missing.
_output_folder_missing = not os.path.exists(OUTPUT_FOLDER)
if _output_folder_missing:
    os.makedirs(OUTPUT_FOLDER)
    print(f"Created output folder: {OUTPUT_FOLDER}")

# Start-of-run banner with a timestamp
_banner = "=" * 80
print(_banner)
print("EMPLOYEE NUMBER UPDATE - EMPLOYEE# + SSNIT MATCHING")
print(_banner)
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Started at: {_started_at}\n")

# =============================================================================
# STEP 1: LOAD MASTER DATA AND CREATE MAPPING
# =============================================================================
#
# Reads the corrected master CSV (every column as text) and builds `mapping`:
#   key   -> (Original_Employee_Number, SSNIT_Number)
#   value -> {'new_emp_num', 'status', 'employee_name'}
# Rows whose original employee number or SSNIT number is blank or a
# placeholder are left out. If the same key occurs more than once, the last
# row in the file wins. Any failure here aborts the run with exit status 1.

print("Step 1: Loading master data and creating mapping dictionary...")

try:
    master_df = pd.read_csv(CORRECTED_MASTER_FILE, dtype=str)
    master_df.columns = master_df.columns.str.strip()

    # Fail early with a readable message instead of a bare KeyError
    required_columns = ['Employee_Number', 'SSNIT_Number', 'Original_Employee_Number', 'Status']
    missing_columns = [col for col in required_columns if col not in master_df.columns]
    if missing_columns:
        raise ValueError(
            f"Master file is missing required column(s): {', '.join(missing_columns)}"
        )

    # Normalise the key/value columns: NaN -> '' and trim whitespace
    for column in required_columns:
        master_df[column] = master_df[column].fillna('').str.strip()

    # Placeholder values that must never take part in matching
    skip_employee_values = {'', 'EMPTY_EMP', 'UNKNOWN_EMP', 'UNKNOWN'}
    skip_ssnit_values = {'', 'UNKNOWN'}

    # Employee_Name is optional; fall back to '' when the column is absent
    if 'Employee_Name' in master_df.columns:
        employee_names = master_df['Employee_Name'].tolist()
    else:
        employee_names = [''] * len(master_df)

    # Column-wise iteration avoids building a Series per row (as iterrows does)
    mapping = {}
    master_rows = zip(
        master_df['Original_Employee_Number'],
        master_df['SSNIT_Number'],
        master_df['Employee_Number'],
        master_df['Status'],
        employee_names,
    )
    for original_emp, ssnit, new_emp_num, status, employee_name in master_rows:
        if original_emp in skip_employee_values or ssnit in skip_ssnit_values:
            continue

        mapping[(original_emp, ssnit)] = {
            'new_emp_num': new_emp_num,
            'status': status,
            'employee_name': employee_name,
        }

    # Count statuses (substring match, so e.g. 'Kept - verified' also counts)
    kept_count = sum(1 for v in mapping.values() if 'Kept' in v['status'])
    reassigned_count = sum(1 for v in mapping.values() if 'Reassigned' in v['status'])

    print(f"Loaded {len(mapping)} mappings from master data")
    print(f"  - Status 'Kept': {kept_count}")
    print(f"  - Status 'Reassigned': {reassigned_count}")
    print(f"  - Empty/Unknown employee numbers: Excluded from mapping")

except Exception as e:
    print(f"ERROR loading master file: {str(e)}")
    # Nothing below can run without the mapping. Raise SystemExit directly so
    # the process ends with a non-zero status and does not depend on the
    # interactive `exit` helper being available.
    raise SystemExit(1)

# =============================================================================
# PRE-VALIDATION: CHECK FOR DUPLICATE (EMPLOYEE#, SSNIT) PAIRS IN MASTER
# =============================================================================
#
# Reports (Original_Employee_Number, SSNIT_Number) pairs that occur more than
# once among the master rows that are eligible for the mapping. The exclusion
# lists mirror the ones used when the mapping is built, so the warning only
# covers pairs that can actually end up as mapping keys.

print("\nPre-validation: Checking for duplicate (Employee#, SSNIT) pairs in master...")

# Placeholder values that never take part in matching
excluded_emp_values = ['', 'EMPTY_EMP', 'UNKNOWN_EMP', 'UNKNOWN']
excluded_ssnit_values = ['', 'UNKNOWN']

# Rows with a usable employee number AND a usable SSNIT number
master_check = master_df[
    ~master_df['Original_Employee_Number'].isin(excluded_emp_values)
    & ~master_df['SSNIT_Number'].isin(excluded_ssnit_values)
]

# Occurrences per pair; anything above 1 is a duplicate
duplicate_check = master_check.groupby(['Original_Employee_Number', 'SSNIT_Number']).size()
duplicates = duplicate_check[duplicate_check > 1]

if not duplicates.empty:
    print(f"WARNING: Found {len(duplicates)} duplicate (Employee#, SSNIT) pairs in master data:")
    for (emp, ssnit), count in duplicates.items():
        print(f"  - {emp} + {ssnit}: appears {count} times")
    print("  -> These may cause unexpected behavior. Review master data!")
else:
    print("No duplicate (Employee#, SSNIT) pairs found in master data")

# =============================================================================
# STEP 2: PROCESS ALL SCHEDULE FILES
# =============================================================================
#
# Walks SCHEDULES_FOLDER, and for every Excel workbook:
#   * reads each sheet as text,
#   * looks up (column A, column C) = (employee number, SSNIT) in `mapping`,
#   * replaces column A with the new employee number when the master status
#     contains 'Reassigned',
#   * writes all sheets to the same relative location under OUTPUT_FOLDER.
# Update-log entries and the update counter are committed only after the
# workbook has been written, so the logs describe what is really on disk.

print("\nStep 2: Processing schedule files...")
print("-" * 80)

files_processed = 0
files_updated = 0
files_skipped = 0
total_updates = 0
total_rows_checked = 0
total_empty_unknown_skipped = 0
update_log = []
error_log = []
no_match_log = []

EXCEL_EXTENSIONS = ('.xlsx', '.xls', '.xlsm')
# Employee-number values that are left exactly as they are ('nan'/'None'
# cover cells that already contain those literal strings).
SKIPPED_EMP_VALUES = frozenset(['', 'EMPTY_EMP', 'UNKNOWN_EMP', 'UNKNOWN', 'nan', 'None'])

# Walk through all directories
for root, dirs, files in os.walk(SCHEDULES_FOLDER):
    # Skip the output folder and analysis output folder
    if OUTPUT_FOLDER in root or 'ANALYSIS_OUTPUT' in root:
        continue

    for file in files:
        ext = os.path.splitext(file)[1].lower()
        if ext not in EXCEL_EXTENSIONS:
            continue
        # '~$name.xlsx' files are lock files Excel creates while a workbook
        # is open; they are not readable workbooks.
        if file.startswith('~$'):
            continue

        file_path = os.path.join(root, file)
        relative_path = os.path.relpath(root, SCHEDULES_FOLDER)

        try:
            files_processed += 1
            print(f"\n[{files_processed}] Processing: {file}")

            sheets_data = {}
            pending_updates = []  # committed to update_log after a successful save

            # Open the workbook once and close it deterministically; every
            # sheet is read through this single handle.
            with pd.ExcelFile(file_path) as xl_file:
                for sheet_name in xl_file.sheet_names:
                    sheet_update_entries = []
                    try:
                        df = pd.read_excel(xl_file, sheet_name=sheet_name, dtype=str)

                        # Columns A and C are required; pass other sheets through
                        if len(df.columns) < 3:
                            sheets_data[sheet_name] = df
                            continue

                        # Snapshot column A (employee number) and column C (SSNIT)
                        emp_values = df.iloc[:, 0].tolist()
                        ssnit_values = df.iloc[:, 2].tolist()

                        rows_checked = 0
                        empty_unknown_skipped = 0

                        for pos, (emp_raw, ssnit_raw) in enumerate(zip(emp_values, ssnit_values)):
                            schedule_emp_num = str(emp_raw).strip() if pd.notna(emp_raw) else ''
                            schedule_ssnit = str(ssnit_raw).strip() if pd.notna(ssnit_raw) else ''

                            # Skip completely empty rows
                            if schedule_emp_num == '' and schedule_ssnit == '':
                                continue

                            rows_checked += 1
                            total_rows_checked += 1

                            # Empty/unknown employee numbers are preserved as-is
                            if schedule_emp_num in SKIPPED_EMP_VALUES:
                                empty_unknown_skipped += 1
                                total_empty_unknown_skipped += 1
                                continue

                            excel_row = pos + 2  # +2 for Excel row (header + 0-index)

                            mapping_info = mapping.get((schedule_emp_num, schedule_ssnit))
                            if mapping_info is None:
                                # No match found - log for review
                                no_match_log.append({
                                    'FILE': file,
                                    'SHEET': sheet_name,
                                    'ROW': excel_row,
                                    'EMPLOYEE_NUMBER': schedule_emp_num,
                                    'SSNIT': schedule_ssnit,
                                    'REASON': 'Not found in master data'
                                })
                                continue

                            # Only 'Reassigned' entries change anything ('Kept' etc. do nothing)
                            if 'Reassigned' not in mapping_info['status']:
                                continue

                            new_emp_num = mapping_info['new_emp_num']
                            if new_emp_num == '':
                                # Never overwrite a real employee number with a blank
                                error_log.append({
                                    'FILE': file,
                                    'SHEET': sheet_name,
                                    'ROW': excel_row,
                                    'ISSUE': 'Reassigned in master but new employee number is blank; row left unchanged'
                                })
                                continue

                            df.iat[pos, 0] = new_emp_num
                            sheet_update_entries.append({
                                'FILE': file,
                                'SHEET': sheet_name,
                                'ROW': excel_row,
                                'OLD_EMP_NUM': schedule_emp_num,
                                'SSNIT': schedule_ssnit,
                                'NEW_EMP_NUM': new_emp_num,
                                'STATUS': mapping_info['status']
                            })

                        sheets_data[sheet_name] = df
                        pending_updates.extend(sheet_update_entries)
                        sheet_updates = len(sheet_update_entries)

                        if sheet_updates > 0:
                            print(f"  Sheet '{sheet_name}': {sheet_updates} updates | {rows_checked} rows checked | {empty_unknown_skipped} empty/unknown skipped")
                        elif rows_checked > 0:
                            print(f"  Sheet '{sheet_name}': No updates | {rows_checked} rows checked | {empty_unknown_skipped} empty/unknown skipped")

                    except Exception as e:
                        print(f"  Error in sheet '{sheet_name}': {str(e)}")
                        error_log.append({
                            'FILE': file,
                            'SHEET': sheet_name,
                            'ROW': 'N/A',
                            'ISSUE': f'Sheet processing error: {str(e)}'
                        })
                        # Keep original sheet data; the sheet's partial updates
                        # are dropped together with their log entries. If this
                        # re-read fails too, the whole file is reported below.
                        sheets_data[sheet_name] = pd.read_excel(xl_file, sheet_name=sheet_name)

            # Save the file under the same relative folder
            output_dir = os.path.join(OUTPUT_FOLDER, relative_path)
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, file)

            # NOTE(review): the workbook is always written with openpyxl, so
            # the content is presumably .xlsx-format even when the file name
            # ends in .xls/.xlsm, and only cell values are carried over -
            # confirm this is acceptable for those inputs.
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                for sheet_name, sheet_df in sheets_data.items():
                    sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

            # The workbook is on disk: now the updates really happened
            update_log.extend(pending_updates)
            total_updates += len(pending_updates)

            if pending_updates:
                print(f"  Saved with updates to: {relative_path}\\{file}")
                files_updated += 1
            else:
                print(f"  Saved (no changes) to: {relative_path}\\{file}")
                files_skipped += 1

        except Exception as e:
            print(f"  ERROR processing file: {str(e)}")
            error_log.append({
                'FILE': file,
                'SHEET': 'N/A',
                'ROW': 'N/A',
                'ISSUE': f'File processing error: {str(e)}'
            })

# =============================================================================
# STEP 3: GENERATE REPORTS
# =============================================================================
#
# Writes one CSV per non-empty log (updates, no-matches, errors) and a plain
# text summary report, all into OUTPUT_FOLDER.

_rule = "=" * 80

print("\n" + _rule)
print("GENERATING REPORTS")
print(_rule)

# --- CSV logs (each one only when it has entries) ---------------------------
if update_log:
    log_file = os.path.join(OUTPUT_FOLDER, "Update_Log.csv")
    log_df = pd.DataFrame(update_log)
    log_df.to_csv(log_file, index=False)
    print(f"Update log saved: {log_file} ({len(update_log)} updates)")

if no_match_log:
    no_match_file = os.path.join(OUTPUT_FOLDER, "No_Match_Log.csv")
    no_match_df = pd.DataFrame(no_match_log)
    no_match_df.to_csv(no_match_file, index=False)
    print(f"No-match log saved: {no_match_file} ({len(no_match_log)} records)")

if error_log:
    error_file = os.path.join(OUTPUT_FOLDER, "Error_Log.csv")
    error_df = pd.DataFrame(error_log)
    error_df.to_csv(error_file, index=False)
    print(f"Error log saved: {error_file} ({len(error_log)} errors)")

# --- Text report: header, summary and a description of the matching logic ---
report = [
    _rule,
    "SCHEDULE UPDATE REPORT - EMPLOYEE# + SSNIT MATCHING",
    _rule,
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Master File: {CORRECTED_MASTER_FILE}",
    f"Schedules Folder: {SCHEDULES_FOLDER}",
    f"Output Folder: {OUTPUT_FOLDER}",
    "",
    _rule,
    "SUMMARY",
    _rule,
    f"Mappings Loaded from Master:        {len(mapping):,}",
    f"  - Status 'Kept':                  {kept_count:,}",
    f"  - Status 'Reassigned':            {reassigned_count:,}",
    "",
    f"Schedule Files Processed:           {files_processed:,}",
    f"Schedule Files Updated:             {files_updated:,}",
    f"Schedule Files (No Changes):        {files_skipped:,}",
    f"Total Rows Checked:                 {total_rows_checked:,}",
    f"Empty/Unknown Employee# Skipped:    {total_empty_unknown_skipped:,}",
    f"Total Employee Number Updates:      {total_updates:,}",
    f"Records Not Found in Master:        {len(no_match_log):,}",
    f"Errors/Warnings:                    {len(error_log):,}",
    "",
    _rule,
    "MATCHING LOGIC USED",
    _rule,
    "For each row in schedule:",
    "1. Read Employee Number (Column A) and SSNIT Number (Column C)",
    "2. If Employee Number is EMPTY/UNKNOWN -> SKIP (leave as-is)",
    "3. Create key: (Employee_Number, SSNIT_Number)",
    "4. Look up key in Master_Data_Corrected.csv",
    "5. If found:",
    "   - Status = 'Kept' -> DO NOTHING (person kept their number)",
    "   - Status = 'Reassigned' -> UPDATE to new employee number",
    "6. If not found -> LOG for review (may need manual check)",
    "",
    "This approach:",
    "Matches on TWO fields (Employee# + SSNIT) for accuracy",
    "Preserves empty/unknown employee numbers",
    "Only updates records that were actually reassigned",
    "",
]

# --- Text report: files ranked by number of updates (first 20) --------------
if update_log:
    report.extend([_rule, "UPDATES BY FILE", _rule])

    log_df = pd.DataFrame(update_log)
    file_summary = log_df.groupby('FILE').size().reset_index(name='UPDATE_COUNT')
    file_summary = file_summary.sort_values('UPDATE_COUNT', ascending=False)

    report.append(f"{'FILE':<50} | {'UPDATES'}")
    report.append("-" * 80)
    top_files = file_summary.head(20)
    for file_name, update_count in zip(top_files['FILE'], top_files['UPDATE_COUNT']):
        report.append(f"{file_name:<50} | {update_count}")

    remaining_files = len(file_summary) - 20
    if remaining_files > 0:
        report.append(f"... and {remaining_files} more files")
    report.append("")

# --- Text report: unmatched employee numbers ranked by frequency ------------
if no_match_log:
    report.extend([
        _rule,
        "RECORDS NOT FOUND IN MASTER (Top 20)",
        _rule,
        "These schedule records didn't match any master data:",
        "Review these manually - they may be:",
        "  - New employees not in master data",
        "  - Typos in employee number or SSNIT",
        "  - Data from before/after your master data period",
        "",
    ])

    no_match_df = pd.DataFrame(no_match_log)
    emp_summary = no_match_df.groupby('EMPLOYEE_NUMBER').size().reset_index(name='COUNT')
    emp_summary = emp_summary.sort_values('COUNT', ascending=False)

    report.append(f"{'EMPLOYEE_NUMBER':<20} | {'OCCURRENCES'}")
    report.append("-" * 80)
    top_employees = emp_summary.head(20)
    for employee_number, occurrences in zip(top_employees['EMPLOYEE_NUMBER'], top_employees['COUNT']):
        report.append(f"{employee_number:<20} | {occurrences}")

    remaining_employees = len(emp_summary) - 20
    if remaining_employees > 0:
        report.append(f"... and {remaining_employees} more employee numbers")
    report.append("")

# --- Text report: errors grouped by message ---------------------------------
if error_log:
    report.extend([_rule, "ERRORS/WARNINGS", _rule])

    error_df = pd.DataFrame(error_log)
    issue_summary = error_df.groupby('ISSUE').size().reset_index(name='COUNT')

    for issue_text, issue_count in zip(issue_summary['ISSUE'], issue_summary['COUNT']):
        report.append(f"  - {issue_text}: {issue_count} occurrences")
    report.extend(["", "See Error_Log.csv for full details", ""])

report.extend([_rule, "END OF REPORT", _rule])

# Save text report
report_file = os.path.join(OUTPUT_FOLDER, "Schedule_Update_Report.txt")
with open(report_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report))

print(f"Text report saved: {report_file}")

# =============================================================================
# FINAL SUMMARY
# =============================================================================
#
# Console recap of the run: counters, output location, the files that were
# actually generated, the matching rules applied and suggested next steps.
# List items are numbered dynamically so no gaps appear when an optional log
# was not written.

print("\n" + "=" * 80)
print("UPDATE COMPLETE!")
print("=" * 80)
print("\nSUMMARY:")
print(f"  Files processed: {files_processed}")
print(f"  Files with updates: {files_updated}")
print(f"  Files without updates: {files_skipped}")
print(f"  Total rows checked: {total_rows_checked:,}")
print(f"  Empty/Unknown employee numbers skipped: {total_empty_unknown_skipped:,}")
print(f"  Total employee number updates: {total_updates}")
print(f"  Records not found in master: {len(no_match_log)}")
print(f"  Errors/warnings: {len(error_log)}")

print("\nOUTPUT LOCATION:")
print(f"  {OUTPUT_FOLDER}")

# Only list the logs that exist (each CSV is written only when non-empty)
generated_files = ["All schedule files (preserving folder structure)"]
if update_log:
    generated_files.append(f"Update_Log.csv - Every change made ({len(update_log)} updates)")
if no_match_log:
    generated_files.append(f"No_Match_Log.csv - Records not found in master ({len(no_match_log)} records)")
generated_files.append("Schedule_Update_Report.txt - Summary report")
if error_log:
    generated_files.append(f"Error_Log.csv - Issues found during processing ({len(error_log)} errors)")

print("\nFILES GENERATED:")
for number, description in enumerate(generated_files, start=1):
    print(f"  {number}. {description}")

print("\nLOGIC APPLIED:")
print("  Matched using (Employee Number + SSNIT Number) combination")
print("  Updated ONLY when Status = 'Reassigned'")
print("  Left unchanged when Status = 'Kept'")
print("  SKIPPED all empty/unknown employee numbers (preserved as-is)")

next_steps = []
if update_log:
    next_steps.append("Review Update_Log.csv to see all changes made")
if no_match_log:
    next_steps.append(f"Check No_Match_Log.csv - {len(no_match_log)} records not found in master")
if error_log:
    next_steps.append("Check Error_Log.csv for processing errors")
next_steps.append("Verify a few files manually to ensure correctness")
# Name the real source folder rather than a hard-coded (and outdated) one
next_steps.append(f"Original files remain in {SCHEDULES_FOLDER} (untouched)")

print("\nNEXT STEPS:")
for number, step in enumerate(next_steps, start=1):
    print(f"  {number}. {step}")

print("\n" + "=" * 80)
print("ALL DONE!")
print("=" * 80)