# In [None]:
import pandas as pd
import os
from datetime import datetime

# File paths
# input_file  - workbook loaded into `df` below.
# output_file - destination of the de-duplicated workbook.
# log_file    - plain-text report of every consolidation performed.
#               NOTE(review): this lives in a different folder (NEWD_DONE)
#               than the workbooks (NEWD) - confirm that is intended.
input_file = r'C:\Users\spt-admin\Desktop\NEWD\master_list_by_ssnit.xlsx'
output_file = r'C:\Users\spt-admin\Desktop\NEWD\Master_List_List.xlsx'
log_file = r'C:\Users\spt-admin\Desktop\NEWD_DONE\CONSOLIDATION_LOG.txt'

# Read the Excel file
# Loaded with pandas defaults (no sheet or dtype arguments). The rest of the
# script reads the columns Employee_Name, SSNIT_Number and Employee_Number.
print("Reading Excel file...")
df = pd.read_excel(input_file)

# Function to normalize names (remove extra spaces, commas, standardize)
def normalize_name(name):
    """Return a canonical form of an employee name for duplicate matching.

    The value is converted to text, lower-cased, commas are treated as word
    separators, and every run of whitespace is collapsed to a single space
    with no leading or trailing space.

    Args:
        name: Raw cell value; may be a string, a number, or a missing value
            (None / NaN).

    Returns:
        The normalized name, or '' when the value is missing.
    """
    if pd.isna(name):
        return ''
    # Turn commas into spaces instead of deleting them: deleting fused
    # "Doe,John" into "doejohn", which never matched "Doe, John".
    words = str(name).lower().replace(',', ' ').split()
    # split() with no argument already drops leading, trailing and repeated
    # whitespace, so joining on one space yields the final form.
    return ' '.join(words)

# Matching key: rows whose normalized names are equal are later compared
# against each other as possible duplicates.
df['Name_Normalized'] = df['Employee_Name'].apply(normalize_name)

# Start the run log with a header: title, timestamp, source file and the
# row count before any consolidation takes place.
log_entries = [
    "=== EMPLOYEE CONSOLIDATION LOG ===",
    f"Date: {datetime.now():%Y-%m-%d %H:%M:%S}",
    f"Input file: {input_file}",
    f"Total records before: {len(df)}\n",
]

# Function to normalize SSNIT (remove trailing zeros)
def normalize_ssnit(ssnit):
    """Reduce an SSNIT value to the key used for duplicate comparison.

    The value is converted to text and trimmed, everything from the first
    '.' onward is discarded (e.g. the ".0" left by a float conversion), and
    trailing '0' characters are removed from what remains.

    Note that the zero-stripping applies to the number itself, not only to
    a decimal tail: "1200" and "12" both normalize to "12".

    Args:
        ssnit: Raw cell value; may be a string, a number, or missing.

    Returns:
        The normalized text, '0' if stripping leaves nothing, or None when
        the value is missing or blank.
    """
    if pd.isna(ssnit):
        return None

    text = str(ssnit).strip()
    if text == '':
        return None

    # Keep only what precedes the first decimal point, if there is one
    before_point = text.partition('.')[0]

    trimmed = before_point.rstrip('0')
    if trimmed:
        return trimmed
    # The value consisted solely of zeros (or had nothing before the point)
    return '0'

# Add normalized SSNIT column
df['SSNIT_Normalized'] = df['SSNIT_Number'].apply(normalize_ssnit)

# A merged row stores text such as "1001 | 1002" in Employee_Number. Make the
# column object-typed up front so that write never depends on pandas
# implicitly upcasting a numeric column (deprecated since pandas 2.1).
df['Employee_Number'] = df['Employee_Number'].astype(object)

# Index labels of the rows folded into another row, and the number of merges
rows_to_delete = []
grouped_count = 0

# Two rows are treated as the same employee when both their normalized name
# and their normalized SSNIT are equal. Rows without an SSNIT never merge.
print("Processing duplicates...")
for name, group in df.groupby('Name_Normalized', dropna=False):
    if len(group) == 1:
        continue  # No duplicates for this name

    # Ascending index order makes the earliest row of each cluster the keeper
    group = group.sort_index()

    # notna() excludes missing SSNITs whether pandas stored them as None or NaN
    with_ssnit = group[group['SSNIT_Normalized'].notna()]

    # Equal SSNIT is transitive, so grouping on it finds every merge cluster
    # in one pass instead of comparing each pair of rows. sort=False yields
    # the clusters in order of first appearance, which keeps the group
    # numbering in the log tied to row order.
    for ssnit_key, cluster in with_ssnit.groupby('SSNIT_Normalized', sort=False):
        if len(cluster) < 2:
            continue  # This SSNIT occurs once under this name

        grouped_count += 1

        merge_indices = list(cluster.index)
        keep_idx = merge_indices[0]
        employee_numbers = [str(value) for value in cluster['Employee_Number']]
        ssnit_values = list(cluster['SSNIT_Number'])

        # Combine employee numbers with |
        combined_employee_numbers = ' | '.join(employee_numbers)

        # The first row of the cluster survives and carries every number
        df.at[keep_idx, 'Employee_Number'] = combined_employee_numbers

        # All later rows of the cluster are removed after the loop
        rows_to_delete.extend(merge_indices[1:])

        # Log this consolidation
        log_entries.append(f"\n--- Group {grouped_count} ---")
        log_entries.append(f"Employee Name: {name}")
        log_entries.append(f"Combined Employee Numbers: {combined_employee_numbers}")
        log_entries.append(f"Rows merged: {len(employee_numbers)}")
        for other_ssnit in ssnit_values[1:]:
            log_entries.append(
                f"  - SSNIT match: {ssnit_values[0]} = {other_ssnit} (normalized to {ssnit_key})"
            )
        log_entries.append("Original Employee Numbers:")
        for emp_num, ssnit_value in zip(employee_numbers, ssnit_values):
            log_entries.append(f"  {emp_num}: SSNIT={ssnit_value}")

# Remove duplicate rows
print(f"Removing {len(rows_to_delete)} duplicate rows...")
df_cleaned = df.drop(rows_to_delete)

# Remove the temporary normalized columns
df_cleaned = df_cleaned.drop(['SSNIT_Normalized', 'Name_Normalized'], axis=1)

# Save cleaned file
print("Saving cleaned Excel file...")
df_cleaned.to_excel(output_file, index=False)

# Add summary to log
log_entries.append("\n=== SUMMARY ===")
log_entries.append(f"Total records after: {len(df_cleaned)}")
log_entries.append(f"Records deleted: {len(rows_to_delete)}")
log_entries.append(f"Groups consolidated: {grouped_count}")

# Write log file
print("Writing log file...")
# The log goes to a different folder than the workbooks; create it if needed
# so the run does not fail after the Excel file has already been written.
log_dir = os.path.dirname(log_file)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)
with open(log_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(log_entries))

print("\n✓ Process completed successfully!")
print(f"✓ Cleaned file saved to: {output_file}")
print(f"✓ Log file saved to: {log_file}")
print(f"✓ Records before: {len(df)}")
print(f"✓ Records after: {len(df_cleaned)}")
print(f"✓ Records removed: {len(rows_to_delete)}")
print(f"✓ Groups consolidated: {grouped_count}")