# In [None]:  (notebook cell marker left over from the export; commented out so the file is valid Python)
# CELL 1: Import Libraries and Setup
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from collections import defaultdict
import re
from datetime import datetime
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment
import warnings
warnings.filterwarnings('ignore')

# Confirm the imports succeeded and log the wall-clock time the run began.
print("Libraries imported successfully")
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("Processing started at: " + _started_at)


# CELL 2: Configuration and File Paths
# =============================================================================
# CONFIGURATION SETTINGS
# =============================================================================

# Where the master CSV is read from and where every generated file is written.
INPUT_FILE = r"C:\Users\spt-admin\Desktop\PSWEPS_NEWADD\01_Master_Data.csv"
OUTPUT_DIR = r"C:\Users\spt-admin\Desktop\PSWEPS_NEWADD\NEW FOLDERS"

# Matching rules used by the helper functions below.
FUZZY_MATCH_THRESHOLD = 80  # minimum fuzzy score for two names to count as similar
SSNIT_TEMP_PATTERN = r'^\d+$'  # temporary SSNIT: digits only
SSNIT_PERM_PATTERN = r'^[A-Z]\d{12}$'  # permanent SSNIT: one capital letter + 12 digits

# Echo the active settings so each run documents its own configuration.
for _config_line in (
    "Configuration loaded:",
    f"  Input file: {INPUT_FILE}",
    f"  Output directory: {OUTPUT_DIR}",
    f"  Fuzzy match threshold: {FUZZY_MATCH_THRESHOLD}%",
):
    print(_config_line)


# CELL 3: Load Data
# =============================================================================
# LOAD INPUT DATA
# =============================================================================

# Load the master CSV, map its headers onto the canonical column names used by
# the rest of the script, and normalise missing / blank key values.
print("\nLoading data...")
try:
    # Every column is read as text (dtype=str).
    df = pd.read_csv(INPUT_FILE, dtype=str)
except FileNotFoundError:
    print("ERROR: The file was not found at the specified path.")
    print(f"Please check this path is correct: {INPUT_FILE}")
    # exit() is the interactive-shell helper injected by `site`; raise
    # SystemExit directly and report failure with a non-zero status.
    raise SystemExit(1) from None

# Standardize column names by stripping whitespace
df.columns = df.columns.str.strip()

print("Columns found in CSV:", df.columns.tolist())

# --- TAILORED COLUMN MAPPING ---
# Headers are matched case-insensitively against the three expected names.
canonical_names = {
    'employee_number': 'Employee_Number',
    'full_name': 'Employee_Name',
    'ssnit_number': 'SSNIT_Number',
}
column_mapping = {
    col: canonical_names[col.lower()]
    for col in df.columns
    if col.lower() in canonical_names
}

df.rename(columns=column_mapping, inplace=True)

# Check if the essential columns were found after renaming
required_cols = ['Employee_Number', 'Employee_Name', 'SSNIT_Number']
for col in required_cols:
    if col not in df.columns:
        print(f"\nCRITICAL ERROR: Could not find the required column '{col}'.")
        print("Please ensure your CSV has the columns EMPLOYEE_NUMBER, FULL_NAME, and SSNIT_NUMBER.")
        raise SystemExit(1)

# Replace missing values with placeholders and strip surrounding whitespace.
# A whitespace-only cell becomes '' after stripping; treat it as missing too,
# otherwise '' would later be handled as a real SSNIT/name/number (it is not
# equal to the 'UNKNOWN' placeholder the processing step filters on).
placeholders = {
    'Employee_Number': 'UNKNOWN_EMP',
    'Employee_Name': 'UNKNOWN_NAME',
    'SSNIT_Number': 'UNKNOWN',
}
for col, placeholder in placeholders.items():
    df[col] = df[col].fillna(placeholder).str.strip()
    df[col] = df[col].replace('', placeholder)

print(f"\n Data loaded: {len(df)} records")
print(f"  Unique Employee Numbers: {df['Employee_Number'].nunique()}")
print(f"  Unique Names: {df['Employee_Name'].nunique()}")
print(f"  Unique SSNITs: {df['SSNIT_Number'].nunique()}")
print("\nSample data:")
print(df.head())

# CELL 4: Helper Functions - SSNIT Classification
# =============================================================================
# HELPER FUNCTIONS - SSNIT CLASSIFICATION
# =============================================================================

def classify_ssnit(ssnit):
    """
    Label an SSNIT value by its format.

    Missing values and the placeholder strings '', 'UNKNOWN', 'nan' and 'None'
    are 'unknown'. Otherwise the stripped text is tested against the permanent
    pattern first, then the temporary pattern.

    Returns: 'temp', 'perm', or 'unknown'
    """
    if pd.isna(ssnit):
        return 'unknown'

    value = str(ssnit).strip()
    if value in ('', 'UNKNOWN', 'nan', 'None'):
        return 'unknown'

    # Permanent is checked before temporary, mirroring the pattern priority.
    for label, pattern in (('perm', SSNIT_PERM_PATTERN), ('temp', SSNIT_TEMP_PATTERN)):
        if re.match(pattern, value):
            return label

    return 'unknown'

def validate_ssnit_combination(ssnit_list):
    """
    Decide whether a set of SSNITs can belong to one person.

    A combination is rejected when it holds more than one temporary SSNIT or
    more than one permanent SSNIT (temporary is checked first). Values that
    classify as 'unknown' are ignored.

    Returns: (is_valid, violation_reason) - reason is None when valid
    """
    if not ssnit_list:
        return True, None

    # Bucket each SSNIT by its classification.
    by_kind = {'temp': [], 'perm': []}
    for candidate in ssnit_list:
        kind = classify_ssnit(candidate)
        if kind in by_kind:
            by_kind[kind].append(candidate)

    temp_ssnits = by_kind['temp']
    perm_ssnits = by_kind['perm']

    if len(temp_ssnits) > 1:
        return False, f"Multiple temporary SSNITs: {temp_ssnits}"
    if len(perm_ssnits) > 1:
        return False, f"Multiple permanent SSNITs: {perm_ssnits}"
    return True, None

# Quick smoke test: print the classification of a few sample SSNITs, then the
# validation verdict for three sample combinations.
print("Testing SSNIT classification:")
test_ssnits = ['454621', 'H123981259123', 'UNKNOWN']
for ssnit in test_ssnits:
    print("  " + ssnit + ": " + classify_ssnit(ssnit))

print("\nTesting SSNIT validation:")
for _combo in (['454621'], ['454621', 'H123981259123'], ['454621', '789012']):
    _label = ', '.join(_combo)
    print(f"  [{_label}]: {validate_ssnit_combination(_combo)}")


# CELL 5: Helper Functions - Name Matching
# =============================================================================
# HELPER FUNCTIONS - NAME MATCHING
# =============================================================================

def fuzzy_name_match(name1, name2, threshold=FUZZY_MATCH_THRESHOLD):
    """
    Tell whether two names are close enough to be treated as the same person.

    Names are compared case-insensitively after trimming. Identical names
    match outright; otherwise fuzz.token_sort_ratio must reach `threshold`.
    A missing name never matches.
    """
    if pd.isna(name1) or pd.isna(name2):
        return False

    left, right = (str(raw).strip().lower() for raw in (name1, name2))

    # token_sort_ratio is used so "John Smith" vs "Smith, John" can still match
    return left == right or fuzz.token_sort_ratio(left, right) >= threshold

def group_similar_names(names):
    """
    Group similar names together using fuzzy matching.

    Grouping is greedy: each name joins the first existing group that contains
    at least one name it fuzzy-matches, otherwise it starts a new group.
    Because that result depends on the order names are visited, exact
    duplicates are removed while keeping first-seen order (a plain set() would
    iterate strings in a different order from run to run).

    Args:
        names: iterable of name strings (duplicates allowed).

    Returns: list of lists (each inner list is a group of similar names)
    """
    # dict.fromkeys de-duplicates and preserves input order -> reproducible groups
    unique_names = list(dict.fromkeys(names))
    groups = []

    for name in unique_names:
        for group in groups:
            if any(fuzzy_name_match(name, member) for member in group):
                group.append(name)
                break
        else:
            # No existing group matched: this name starts its own group
            groups.append([name])

    return groups

# Smoke test: group a handful of sample names and print each resulting group.
test_names = ["Kankoh Martin", "Martin, Kankoh", "Martin K.", "Millicent Aidoo"]
groups = group_similar_names(test_names)
print("Testing name grouping:")
for _position, _members in enumerate(groups, start=1):
    print(f"  Group {_position}: {_members}")


# CELL 6: Core Algorithm - Process Employee Numbers
# =============================================================================
# CORE ALGORITHM - PROCESS EACH EMPLOYEE NUMBER
# =============================================================================

def _merge_persons_sharing_ssnits(persons):
    """
    Merge person dicts that are linked, directly or transitively, by a shared SSNIT.

    Each incoming person is folded into every already-built person it shares an
    SSNIT with, so chains such as A{s1} - B{s1, s2} - C{s2} end up as one
    person regardless of the order they arrive in. The merged person stays at
    the position of its earliest member. Input dicts are not modified.
    """
    merged_persons = []

    for person in persons:
        current = dict(person)  # shallow copy: keys are reassigned, never mutated in place
        current_ssnits = set(current['ssnits'])
        linked = [m for m in merged_persons if current_ssnits & set(m['ssnits'])]

        if not linked:
            merged_persons.append(current)
            continue

        # Fold every linked person, plus the new one, into the earliest match.
        target = linked[0]
        absorbed_ids = {id(m) for m in linked[1:]}
        for other in linked[1:] + [current]:
            target['names'] = sorted(set(target['names']) | set(other['names']))
            target['primary_name'] = '/'.join(target['names'])
            # dict.fromkeys de-duplicates while keeping a stable SSNIT order
            target['ssnits'] = list(dict.fromkeys(target['ssnits'] + other['ssnits']))
            target['records'] = pd.concat([target['records'], other['records']])
            target['count'] = len(target['records'])

        # Identity comparison: the dicts hold DataFrames, so == is not usable.
        merged_persons = [m for m in merged_persons if id(m) not in absorbed_ids]

    return merged_persons


def process_employee_number(emp_num, emp_df):
    """
    Split the records filed under one employee number into distinct persons.

    Steps:
      1. Select the rows for `emp_num`; when there is more than one row, rows
         whose SSNIT is 'UNKNOWN' are dropped.
      2. Group the remaining names with fuzzy matching.
      3. A name group whose SSNITs form a valid combination is one person;
         otherwise the group is split into one person per SSNIT.
      4. Persons linked by a shared SSNIT (directly or through a chain of
         shared SSNITs) are merged.

    Args:
        emp_num: employee number to process.
        emp_df: DataFrame with Employee_Number, Employee_Name and SSNIT_Number
            columns.

    Returns: list of person dictionaries with keys 'names', 'primary_name',
        'ssnits', 'records' (DataFrame) and 'count'. Empty when no rows remain.
    """
    # Step 1: Get all records for this employee number
    records = emp_df[emp_df['Employee_Number'] == emp_num].copy()

    # UNKNOWN SSNIT rule: only a lone record may keep an UNKNOWN SSNIT
    if len(records) > 1:
        records = records[records['SSNIT_Number'] != 'UNKNOWN']

    if len(records) == 0:
        return []

    # Step 2: Group by similar names
    name_groups = group_similar_names(records['Employee_Name'].unique())

    persons = []

    # Step 3: Process each name group
    for name_group in name_groups:
        group_records = records[records['Employee_Name'].isin(name_group)]

        # Distinct SSNITs in first-seen order, excluding the UNKNOWN placeholder
        ssnits = [
            s for s in group_records['SSNIT_Number'].unique().tolist()
            if s != 'UNKNOWN'
        ]

        is_valid, _violation = validate_ssnit_combination(ssnits)

        if is_valid:
            # The whole name group is a single person
            persons.append({
                'names': name_group,
                'primary_name': name_group[0],
                'ssnits': ssnits,
                'records': group_records,
                'count': len(group_records)
            })
            continue

        # Invalid combination: one person per distinct SSNIT in the group
        for ssnit in ssnits:
            combo_df = group_records[group_records['SSNIT_Number'] == ssnit]
            persons.append({
                'names': name_group,
                'primary_name': name_group[0],
                'ssnits': [ssnit],
                'records': combo_df,
                'count': len(combo_df)
            })

    # Step 4: Merge persons who share SSNITs
    return _merge_persons_sharing_ssnits(persons)

print(" Core processing function defined")


# CELL 7: Process All Employee Numbers
# =============================================================================
# PROCESS ALL EMPLOYEE NUMBERS
# =============================================================================

# Run the per-employee-number algorithm over the whole dataset, collecting
# every identified person and noting which numbers are shared by 2+ persons.
print("\nProcessing all employee numbers...")
print("This may take a few minutes depending on data size...")

all_persons = []
problematic_emp_numbers = {}

unique_emp_numbers = df['Employee_Number'].unique()

# One groupby pass hands each employee number just its own rows, instead of
# re-scanning the full DataFrame once per number. sort=False keeps the groups
# in first-appearance order, the same order unique() yields.
employee_groups = df.groupby('Employee_Number', sort=False)

for idx, (emp_num, emp_records) in enumerate(employee_groups, 1):
    if idx % 100 == 0:
        print(f"  Processed {idx}/{len(unique_emp_numbers)} employee numbers...")

    persons = process_employee_number(emp_num, emp_records)

    # Track original employee number for each person
    for person in persons:
        person['original_emp_num'] = emp_num
        all_persons.append(person)

    # An employee number that resolves to more than one person is problematic
    if len(persons) > 1:
        problematic_emp_numbers[emp_num] = {
            'person_count': len(persons),
            'persons': persons
        }

print(f"\n Processing complete!")
print(f"  Total persons identified: {len(all_persons)}")
print(f"  Employee numbers with multiple people: {len(problematic_emp_numbers)}")


# CELL 8: Assign New Employee Numbers
# =============================================================================
# ASSIGN NEW EMPLOYEE NUMBERS
# =============================================================================

print("\nAssigning employee numbers...")

# Find highest existing employee number
def extract_number(emp_num):
    """
    Extract the numeric portion of an employee number.

    The whole value is converted with int() when possible. Otherwise the last
    run of digits in its string form is used, and 0 is returned when it
    contains no digits at all.

    Args:
        emp_num: employee number (normally a string).

    Returns: int
    """
    try:
        return int(emp_num)
    except (TypeError, ValueError, OverflowError):
        # Not directly convertible (e.g. 'EMP-0045', None, inf): fall through.
        # Only conversion errors are caught so Ctrl-C etc. still propagate.
        pass

    numbers = re.findall(r'\d+', str(emp_num))
    return int(numbers[-1]) if numbers else 0

# New numbers continue sequentially after the largest numeric value found in
# the existing employee numbers.
max_number = max(extract_number(enum) for enum in df['Employee_Number'].unique())
next_number = max_number + 1

print(f"  Highest existing number: {max_number}")
print(f"  Starting new assignments from: {next_number}")

# One log entry per person involved in a shared employee number
assignment_log = []

for emp_num, info in problematic_emp_numbers.items():
    # Rank by record count (highest first); ties are broken by name
    ranked = sorted(info['persons'], key=lambda p: (-p['count'], p['primary_name']))

    for rank, person in enumerate(ranked):
        if rank == 0:
            # The top-ranked person keeps the original number
            person['new_emp_num'] = emp_num
            person['status'] = 'Kept'
            person['reason'] = f"Most occurrences ({person['count']})"
        else:
            # Everyone else receives the next free number
            person['new_emp_num'] = str(next_number)
            person['status'] = 'Reassigned'

            kinds = [classify_ssnit(s) for s in person['ssnits']]
            if kinds.count('temp') > 1:
                person['reason'] = "2+ temporary SSNITs detected"
            elif kinds.count('perm') > 1:
                person['reason'] = "2+ permanent SSNITs detected"
            else:
                person['reason'] = "Shared employee number"

            next_number += 1

        # Key order matters: the mapping output renames these columns by position
        assignment_log.append({
            'old_emp_num': emp_num,
            'new_emp_num': person['new_emp_num'],
            'name': person['primary_name'],
            'ssnits': ';'.join(person['ssnits']),
            'count': person['count'],
            'status': person['status'],
            'reason': person['reason']
        })

# Persons untouched above had no conflict and keep their original number
for person in all_persons:
    if 'new_emp_num' in person:
        continue
    person['new_emp_num'] = person['original_emp_num']
    person['status'] = 'Kept'
    person['reason'] = 'No sharing detected'

print(f" Assignment complete!")
print(f"  New employee numbers generated: {next_number - max_number - 1}")


# CELL 9: Generate Output File 1 - Master_Data_Corrected.csv
# =============================================================================
# OUTPUT FILE 1: Master_Data_Corrected.csv
# =============================================================================

# Flatten every person's records into one table carrying the (possibly new)
# employee number, then write it out sorted by that number.
print("\nGenerating Master_Data_Corrected.csv...")

corrected_records = [
    {
        'Employee_Number': person['new_emp_num'],
        'Employee_Name': record['Employee_Name'],
        'SSNIT_Number': record['SSNIT_Number'],
        'Original_Employee_Number': person['original_emp_num'],
        'Status': person['status'],
        'Record_Count': person['count']
    }
    for person in all_persons
    for _, record in person['records'].iterrows()
]

# Sort by new employee number
df_corrected = pd.DataFrame(corrected_records).sort_values('Employee_Number')

output_file = OUTPUT_DIR + "\\Master_Data_Corrected.csv"
df_corrected.to_csv(output_file, index=False)

print(f" Saved: {output_file}")
print(f"  Total records: {len(df_corrected)}")


# CELL 10: Generate Output File 2 - Employee_Number_Mapping.csv
# =============================================================================
# OUTPUT FILE 2: Employee_Number_Mapping.csv
# =============================================================================

# Build the old -> new employee number mapping: the assignment log (persons on
# shared numbers) followed by every person whose number was never shared.
print("\nGenerating Employee_Number_Mapping.csv...")

mapping_fields = ['old_emp_num', 'new_emp_num', 'name', 'ssnits',
                  'count', 'status', 'reason']

# Collect all rows first and build the DataFrame once; concatenating a
# one-row frame per person re-copies the whole table on every iteration.
mapping_rows = list(assignment_log)
mapping_rows.extend(
    {
        'old_emp_num': person['original_emp_num'],
        'new_emp_num': person['new_emp_num'],
        'name': person['primary_name'],
        'ssnits': ';'.join(person['ssnits']),
        'count': person['count'],
        'status': 'Kept',
        'reason': 'No sharing detected'
    }
    for person in all_persons
    if person['original_emp_num'] not in problematic_emp_numbers
)

# Explicit columns keep the frame well-formed (and renameable) even with no rows
df_mapping = pd.DataFrame(mapping_rows, columns=mapping_fields)

# Rename columns for output
df_mapping.columns = ['Old_Employee_Number', 'New_Employee_Number', 'Person_Name', 
                      'All_SSNITs', 'Record_Count', 'Action', 'Reason']

# Sort by old employee number
df_mapping = df_mapping.sort_values('Old_Employee_Number')

output_file = f"{OUTPUT_DIR}\\Employee_Number_Mapping.csv"
df_mapping.to_csv(output_file, index=False)

print(f" Saved: {output_file}")
print(f"  Total mappings: {len(df_mapping)}")
# CELL 11: Generate Output File 3 - Duplicate_Resolution_Report.txt
# =============================================================================
# OUTPUT FILE 3: Duplicate_Resolution_Report.txt
# =============================================================================

# Start the plain-text report: the lines are accumulated in `report` and
# joined/written once all sections have been appended.
print("\nGenerating Duplicate_Resolution_Report.txt...")

report = []
report.append("=" * 80)
report.append("EMPLOYEE NUMBER DEDUPLICATION REPORT")
report.append("=" * 80)
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"Input File: {INPUT_FILE}")
report.append(f"Output Location: {OUTPUT_DIR}")
report.append("")

# Section: headline counts before/after the correction.
# next_number started at max_number + 1 and grew by one per reassignment, so
# next_number - max_number - 1 is the number of new employee numbers issued.
report.append("=" * 80)
report.append("SUMMARY STATISTICS")
report.append("=" * 80)
report.append(f"Total Records Processed:                   {len(df):,}")
report.append(f"Unique Employee Numbers (Before):        {df['Employee_Number'].nunique():,}")
report.append(f"Unique Employee Numbers (After):         {df_corrected['Employee_Number'].nunique():,}")
report.append(f"New Employee Numbers Generated:          {next_number - max_number - 1:,}")
report.append(f"Employee Numbers with Sharing Resolved:    {len(problematic_emp_numbers):,}")
report.append("")

# Section: what kinds of problems were found.
report.append("=" * 80)
report.append("ISSUES IDENTIFIED")
report.append("=" * 80)
report.append(f"Employee Numbers with Multiple People:     {len(problematic_emp_numbers):,}")

# Count specific issue types
# The "2+ ..." reasons are only ever set on reassigned persons, so these two
# counts do not include a person who kept the original number.
temp_violations = sum(1 for p in all_persons if '2+ temporary SSNITs' in p.get('reason', ''))
perm_violations = sum(1 for p in all_persons if '2+ permanent SSNITs' in p.get('reason', ''))
# Merged persons get a '/'-joined primary_name.
# NOTE(review): a single name that itself contains '/' would also be counted here.
shared_ssnit_merges = sum(1 for p in all_persons if '/' in p['primary_name'])

report.append(f"- 2+ Temporary SSNITs Detected:          {temp_violations:,} cases")
report.append(f"- 2+ Permanent SSNITs Detected:          {perm_violations:,} cases")
report.append(f"- Name Groups Merged by Shared SSNIT:    {shared_ssnit_merges:,} cases")
report.append("")

# Section: how many persons kept vs. were given a new employee number.
# kept_count / reassigned_count are reused later by the Excel dashboard sheet.
report.append("=" * 80)
report.append("ACTIONS TAKEN")
report.append("=" * 80)
kept_count = sum(1 for p in all_persons if p['status'] == 'Kept')
reassigned_count = sum(1 for p in all_persons if p['status'] == 'Reassigned')
report.append(f"Original Employee Numbers Kept:          {kept_count:,} people")
report.append(f"New Employee Numbers Assigned:           {reassigned_count:,} people")
report.append("")

# Section: the ten employee numbers shared by the most persons.
report.append("=" * 80)
report.append("TOP 10 MOST PROBLEMATIC EMPLOYEE NUMBERS")
report.append("=" * 80)
report.append(f"{'Rank':<6}| {'Old_Emp_No':<12}| {'People':<8}| {'Names_List'}")
report.append("-" * 80)

# Sort problematic employee numbers by person count
top_problems = sorted(problematic_emp_numbers.items(), 
                      key=lambda x: x[1]['person_count'], 
                      reverse=True)[:10]

for rank, (emp_num, info) in enumerate(top_problems, 1):
    names = '; '.join([p['primary_name'] for p in info['persons']])
    # Keep the names column to at most 50 characters (47 + "...")
    if len(names) > 50:
        names = names[:47] + "..."
    report.append(f"{rank:<6}| {emp_num:<12}| {info['person_count']:<8}| {names}")

report.append("")

# Section: how the new employee numbers were generated.
# NOTE(review): when no number was reassigned, next_number - 1 equals
# max_number, so the range below reads as start > end.
report.append("=" * 80)
report.append("NEW EMPLOYEE NUMBER ALLOCATION")
report.append("=" * 80)
report.append(f"Number Generation Method: Sequential from highest existing")
report.append(f"Highest Existing Number Found: {max_number}")
report.append(f"New Numbers Assigned Range: {max_number + 1} - {next_number - 1}")
report.append("")

# Section: fuzzy matching settings and the resulting person count.
report.append("=" * 80)
report.append("FUZZY NAME MATCHING RESULTS")
report.append("=" * 80)
report.append(f"Fuzzy Matching Threshold: {FUZZY_MATCH_THRESHOLD}%")
report.append(f"Total Persons Identified: {len(all_persons):,}")
report.append("")

# Section: follow-up guidance pointing at the other output files.
# NOTE(review): 'Records_For_Manual_Review.csv' is only written when at least
# one record is flagged, so step 1 may refer to a file that does not exist.
report.append("=" * 80)
report.append("NEXT STEPS")
report.append("=" * 80)
report.append("1. Review 'Records_For_Manual_Review.csv' for flagged cases")
report.append("2. Use 'Employee_Number_Mapping.csv' to update your 2010-2025 schedules")
report.append("3. Use 'Reassigned_Employee_Numbers_Register.xlsx' as quick reference")
report.append("4. Keep 'Master_Data_Corrected.csv' as your new clean master list")
report.append("")

report.append("=" * 80)
report.append("END OF REPORT")
report.append("=" * 80)

# Write report
output_file = f"{OUTPUT_DIR}\\Duplicate_Resolution_Report.txt"
# The encoding is given explicitly as UTF-8 rather than left to the platform
# default (presumably so non-ASCII names write cleanly - confirm).
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report))

print(f" Saved: {output_file}")

# CELL 12: Generate Output File 4 - Records_For_Manual_Review.csv
# =============================================================================
# OUTPUT FILE 4: Records_For_Manual_Review.csv
# =============================================================================

# Flag shared employee numbers that a human should double-check.
print("\nGenerating Records_For_Manual_Review.csv...")

# (SSNIT kind, issue label, confidence, details template, recommendation)
_SSNIT_ISSUE_RULES = (
    ('perm', 'Multiple permanent SSNITs', 'High',
     "Same name but {count} permanent SSNITs: {ssnits}",
     'Verify - likely different people with same name'),
    ('temp', 'Multiple temp SSNITs', 'Medium',
     "Same name with {count} temp SSNITs: {ssnits}",
     'Check if sequential temporary assignments or different people'),
)

manual_review = []

for emp_num, info in problematic_emp_numbers.items():
    persons = info['persons']
    names = [p['primary_name'] for p in persons]

    # Issue 1: exactly two persons whose names do not match even at the loose
    # threshold of 50
    if len(names) == 2 and not fuzzy_name_match(names[0], names[1], threshold=50):
        manual_review.append({
            'Employee_Number': emp_num,
            'Issue_Type': 'Very different names',
            'Confidence_Level': 'Low',
            'Names_Found': '; '.join(names),
            'SSNIT_Count': sum(len(p['ssnits']) for p in persons),
            'Details': 'No fuzzy name match, verify these are different people',
            'Recommendation': 'Verify identity documents'
        })

    # Issues 2 and 3: a person holding several permanent, then several
    # temporary, SSNITs (all permanent findings are listed before temporary)
    for kind, issue_type, confidence, details_template, recommendation in _SSNIT_ISSUE_RULES:
        for person in persons:
            matching = [s for s in person['ssnits'] if classify_ssnit(s) == kind]
            if len(matching) <= 1:
                continue
            manual_review.append({
                'Employee_Number': person['new_emp_num'],
                'Issue_Type': issue_type,
                'Confidence_Level': confidence,
                'Names_Found': person['primary_name'],
                'SSNIT_Count': len(matching),
                'Details': details_template.format(count=len(matching),
                                                   ssnits=', '.join(matching)),
                'Recommendation': recommendation
            })

df_review = pd.DataFrame(manual_review)

# The file is only written when something was flagged
if df_review.empty:
    print(" No records flagged for manual review")
else:
    output_file = f"{OUTPUT_DIR}\\Records_For_Manual_Review.csv"
    df_review.to_csv(output_file, index=False)
    print(f" Saved: {output_file}")
    print(f"  Records flagged: {len(df_review)}")


# CELL 13: Generate Output File 5 - Reassigned_Employee_Numbers_Register.xlsx
# =============================================================================
# OUTPUT FILE 5: Reassigned_Employee_Numbers_Register.xlsx
# =============================================================================

# Build the four-sheet Excel register: reassigned persons, persons who kept a
# contested number, a flat search index, and a statistics dashboard.
print("\nGenerating Reassigned_Employee_Numbers_Register.xlsx...")


def _first_two_ssnits(ssnits):
    """Return (SSNIT_1, SSNIT_2), using '' for any slot that has no SSNIT."""
    padded = list(ssnits[:2]) + ['', '']
    return padded[0], padded[1]


def _search_row(search_type, search_value, person):
    """Build one 'Quick Search' row pointing `search_value` at `person`."""
    ssnit_1, ssnit_2 = _first_two_ssnits(person['ssnits'])
    return {
        'Search_Type': search_type,
        'Search_Value': search_value,
        'Result_Type': person['status'],
        'Old_Emp_No': person['original_emp_num'],
        'New_Emp_No': person['new_emp_num'],
        'Person_Name': person['primary_name'],
        'SSNIT_1': ssnit_1,
        'SSNIT_2': ssnit_2
    }


# Sheet 1: Reassigned Employees
reassigned = [p for p in all_persons if p['status'] == 'Reassigned']
sheet1_data = []
for person in reassigned:
    ssnit_1, ssnit_2 = _first_two_ssnits(person['ssnits'])
    sheet1_data.append({
        'New_Emp_No': person['new_emp_num'],
        'Employee_Name': person['primary_name'],
        'SSNIT_1': ssnit_1,
        'SSNIT_2': ssnit_2,
        'Old_Emp_No': person['original_emp_num'],
        'Record_Count': person['count'],
        'Reason': person['reason'],
        'Date': datetime.now().strftime('%Y-%m-%d')
    })
df_sheet1 = pd.DataFrame(sheet1_data)

# Sheet 2: Kept Original Numbers (only persons on a number that was shared)
kept = [
    p for p in all_persons
    if p['status'] == 'Kept' and p['original_emp_num'] in problematic_emp_numbers
]
sheet2_data = []
for person in kept:
    ssnit_1, ssnit_2 = _first_two_ssnits(person['ssnits'])
    sheet2_data.append({
        'Employee_No': person['new_emp_num'],
        'Employee_Name': person['primary_name'],
        'SSNIT_1': ssnit_1,
        'SSNIT_2': ssnit_2,
        'Record_Count': person['count'],
        'Reason_Kept': person['reason']
    })
df_sheet2 = pd.DataFrame(sheet2_data)

# Sheet 3: Quick Search - each person is indexed by name, by every real SSNIT,
# and by old employee number
sheet3_data = []
for person in all_persons:
    sheet3_data.append(_search_row('Name', person['primary_name'], person))
    sheet3_data.extend(
        _search_row('SSNIT', ssnit, person)
        for ssnit in person['ssnits']
        if ssnit and ssnit != 'UNKNOWN'
    )
    sheet3_data.append(_search_row('Old_Emp_No', person['original_emp_num'], person))
df_sheet3 = pd.DataFrame(sheet3_data)

# Sheet 4: Statistics Dashboard, written as (metric, value) pairs
_dashboard_rows = [
    ('Total Records Processed', len(df)),
    ('Unique Employee Numbers (Before)', df['Employee_Number'].nunique()),
    ('Unique Employee Numbers (After)', df_corrected['Employee_Number'].nunique()),
    ('New Numbers Generated', next_number - max_number - 1),
    ('Employee Numbers That Had Sharing', len(problematic_emp_numbers)),
    ('People Who Kept Original Numbers', kept_count),
    ('People Reassigned New Numbers', reassigned_count),
    ('', ''),
    ('=== ISSUE BREAKDOWN ===', ''),
    ('2+ Temporary SSNITs Detected', temp_violations),
    ('2+ Permanent SSNITs Detected', perm_violations),
    ('Name Groups Merged by Shared SSNIT', shared_ssnit_merges),
    ('Records with UNKNOWN SSNIT Removed', len(df) - len(df_corrected)),
]
sheet4_data = {
    'Metric': [metric for metric, _ in _dashboard_rows],
    'Value': [value for _, value in _dashboard_rows]
}
df_sheet4 = pd.DataFrame(sheet4_data)

# Write Excel file
output_file = f"{OUTPUT_DIR}\\Reassigned_Employee_Numbers_Register.xlsx"

# (sheet name, data, header fill colour, header font colour)
_sheet_specs = (
    ('Reassigned Employees', df_sheet1, '366092', 'FFFFFF'),
    ('Kept Original Numbers', df_sheet2, '70AD47', 'FFFFFF'),
    ('Quick Search', df_sheet3, 'FFC000', '000000'),
    ('Statistics Dashboard', df_sheet4, '5B9BD5', 'FFFFFF'),
)

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for sheet_name, frame, _fill_hex, _font_hex in _sheet_specs:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

    # Style the header row (row 1) of each sheet: bold text on a solid fill
    workbook = writer.book
    for sheet_name, _frame, fill_hex, font_hex in _sheet_specs:
        for cell in workbook[sheet_name][1]:
            cell.fill = PatternFill(start_color=fill_hex, end_color=fill_hex, fill_type='solid')
            cell.font = Font(bold=True, color=font_hex)

print(f" Saved: {output_file}")
print(f"  Sheet 1: {len(df_sheet1)} reassigned employees")
print(f"  Sheet 2: {len(df_sheet2)} kept original numbers")
print(f"  Sheet 3: {len(df_sheet3)} searchable records")
print(f"  Sheet 4: Statistics dashboard")


# CELL 14: Final Summary and Verification
# =============================================================================
# FINAL SUMMARY AND VERIFICATION
# =============================================================================

# Closing banner: restate the headline figures on the console.
print("\n" + "=" * 80)
print("PROCESSING COMPLETE!")
print("=" * 80)
# NOTE(review): despite the "Execution Time" label this prints the current
# timestamp (i.e. when the run finished), not an elapsed duration.
print(f"\nExecution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nSUMMARY STATISTICS:")
print(f"  Total records processed: {len(df):,}")
print(f"  Unique employee numbers (before): {df['Employee_Number'].nunique():,}")
print(f"  Unique employee numbers (after): {df_corrected['Employee_Number'].nunique():,}")
# next_number - max_number - 1 == how many new numbers were handed out
print(f"  New employee numbers generated: {next_number - max_number - 1:,}")
print(f"  Employee numbers with multiple people: {len(problematic_emp_numbers):,}")
# kept_count was computed while building the text report
print(f"  People who kept original numbers: {kept_count:,}")