# In [None]:
import pandas as pd
from datetime import datetime

# File locations used by this script. All three are absolute paths on the
# operator's Windows desktop (raw strings, so the backslashes are literal):
#   input_file  - workbook that is read
#   output_file - workbook written after consolidation
#   log_file    - plain-text report of every merge performed
input_file = r'C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_FIDONE.xlsx'
output_file = r'C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_SSNIT_CLEAN.xlsx'
log_file = r'C:\Users\spt-admin\Desktop\NEWD\SSNIT_CONSOLIDATION_LOG.txt'

# Load the workbook into a DataFrame. No sheet or dtype overrides are passed,
# so pandas decides the column types itself.
# The rest of the script reads the REPRESENTATIVE_NAME, SSNIT_NUMBERS and
# EMPLOYEE_NUMBERS columns of this frame.
print("Reading Excel file...")
df = pd.read_excel(input_file)

# Quick sanity output so the operator can confirm the expected sheet loaded.
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

def normalize_ssnit(ssnit):
    """Build the comparison key for a single SSNIT value.

    The value is converted to text and stripped of surrounding whitespace.
    Anything from the first '.' onwards is discarded (this removes the
    '.0' suffix a float-typed cell produces), and then every trailing '0'
    character is removed.

    Returns:
        None when the value is missing (per ``pd.isna``) or blank;
        otherwise the reduced string, or '0' if nothing is left after the
        zeros are stripped.
    """
    if pd.isna(ssnit):
        return None

    text = str(ssnit).strip()
    if not text:
        return None

    # partition() yields the whole string when there is no '.', so a single
    # call covers both the "has decimal part" and "plain" cases.
    integer_part, _, _ = text.partition('.')

    without_zeros = integer_part.rstrip('0')
    return without_zeros if without_zeros else '0'

def normalize_name(name):
    """Return a canonical form of a name for duplicate grouping.

    The name is lower-cased, commas are treated as word separators, and
    every run of whitespace is collapsed to a single space with no leading
    or trailing space.

    Args:
        name: Raw cell value; may be a string, a non-string scalar, or a
            missing value.

    Returns:
        The normalized name, or '' when ``name`` is missing per ``pd.isna``.
    """
    if pd.isna(name):
        return ''

    # Turn commas into spaces instead of deleting them: otherwise
    # "Doe,John" collapses to "doejohn" and never lines up with
    # "Doe, John" / "Doe John", which both normalize to "doe john".
    lowered = str(name).lower().replace(',', ' ')

    # split() with no argument drops leading/trailing whitespace and
    # collapses internal runs, so no extra strip() is required.
    return ' '.join(lowered.split())

# Helper column holding the normalized representative name of each row;
# the consolidation pass groups rows on this value.
df['Name_Normalized'] = df['REPRESENTATIVE_NAME'].map(normalize_name)

def parse_and_normalize_ssnits(ssnit_string):
    """Split a '|'-separated SSNIT cell into parallel lists.

    Each piece is stripped of surrounding whitespace and empty pieces are
    dropped. Every surviving piece is also passed through
    ``normalize_ssnit`` to obtain its comparison key.

    Returns:
        A ``(original, normalized)`` tuple of two lists of equal length,
        where ``normalized[i]`` is the key for ``original[i]``. Both lists
        are empty when the cell is missing per ``pd.isna``.
    """
    originals = []
    keys = []

    if not pd.isna(ssnit_string):
        for piece in str(ssnit_string).split('|'):
            token = piece.strip()
            if token:
                originals.append(token)
                keys.append(normalize_ssnit(token))

    return originals, keys

# Parse every SSNIT_NUMBERS cell exactly once. Each parsed entry is an
# (original_list, normalized_list) pair; the two halves are then fanned out
# into their own list-valued helper columns. (Previously each cell was parsed
# twice and half of each result thrown away.)
_parsed_ssnits = df['SSNIT_NUMBERS'].apply(parse_and_normalize_ssnits)
df['SSNIT_Original_List'] = _parsed_ssnits.apply(lambda pair: pair[0])
df['SSNIT_Normalized_List'] = _parsed_ssnits.apply(lambda pair: pair[1])

# Lines of the consolidation report, collected in memory and written out in
# one go at the end of the script. The header records when the run happened,
# which workbook was read, and how many rows it held before any merging.
log_entries = [
    f"=== SSNIT TRAILING ZEROS CONSOLIDATION LOG ===",
    f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Input file: {input_file}",
    f"Total records before: {len(df)}\n",
]

# Working state for the consolidation pass:
#   rows_to_delete - index labels of rows that were absorbed into another row
#   grouped_count  - number of merges performed; also numbers the log groups
rows_to_delete = []
grouped_count = 0

# Consolidation pass.
# Rows are only ever compared with other rows that share the same
# Name_Normalized value. Within such a group, the first unprocessed row acts
# as the "anchor": every later row that has an SSNIT whose normalized form
# equals one of the anchor's, while the original text differs, is folded into
# the anchor and scheduled for deletion.
# dropna=False makes groupby keep a NaN key as a group instead of dropping it.
print("Processing duplicates...")
for name, group in df.groupby('Name_Normalized', dropna=False):
    if len(group) == 1:
        continue  # No duplicates for this name

    # Visit rows in ascending index order, so the lowest-index row of any
    # matching set is the one that becomes the anchor and survives.
    group = group.sort_index()

    # Index labels in this name group that are already settled: either used
    # as an anchor or absorbed into one. Reset for every name group.
    processed_indices = set()

    # Each still-unprocessed row takes a turn as the anchor.
    for idx in group.index:
        if idx in processed_indices:
            continue

        # Read the anchor from df (not from the group snapshot).
        current_row = df.loc[idx]
        current_ssnits_normalized = current_row['SSNIT_Normalized_List']

        # Accumulators for this anchor; element 0 always belongs to the
        # anchor itself, later elements to the rows merged into it.
        # NOTE(review): str() renders a missing cell as the text 'nan' and a
        # float cell with its decimal part (e.g. '123.0') - confirm that the
        # input columns are text, or that this output is acceptable.
        employee_numbers = [str(current_row['EMPLOYEE_NUMBERS'])]
        ssnit_numbers = [str(current_row['SSNIT_NUMBERS'])]
        merge_indices = [idx]
        merge_reasons = []

        # Compare the anchor with every other unsettled row of the same name.
        # NOTE(review): candidates are matched against the anchor's own SSNIT
        # list only; SSNITs contributed by rows already merged into the anchor
        # are not considered, so matching is not transitive - confirm intended.
        for other_idx in group.index:
            if other_idx == idx or other_idx in processed_indices:
                continue

            other_row = df.loc[other_idx]
            other_ssnits_normalized = other_row['SSNIT_Normalized_List']

            should_merge = False
            matched_ssnits = []

            # Pairwise comparison of every anchor SSNIT with every candidate
            # SSNIT. The normalized and original lists are parallel, so zip
            # pairs each key with the text it was derived from.
            for curr_ssnit_norm, curr_ssnit_orig in zip(current_ssnits_normalized, current_row['SSNIT_Original_List']):
                if curr_ssnit_norm is None:
                    continue
                for other_ssnit_norm, other_ssnit_orig in zip(other_ssnits_normalized, other_row['SSNIT_Original_List']):
                    if other_ssnit_norm is None:
                        continue
                    if curr_ssnit_norm == other_ssnit_norm and curr_ssnit_orig != other_ssnit_orig:
                        # Same key, different spelling: the two SSNITs differ
                        # only in what normalization removes. SSNITs whose
                        # original text is identical do NOT trigger a merge.
                        # The loops keep running so every matching pair is
                        # recorded for the log, not just the first.
                        should_merge = True
                        matched_ssnits.append(f"{curr_ssnit_orig} = {other_ssnit_orig} (normalized to {curr_ssnit_norm})")

            if should_merge:
                # Fold the candidate into the anchor and mark it settled so it
                # is neither merged again nor used as an anchor later.
                employee_numbers.append(str(other_row['EMPLOYEE_NUMBERS']))
                ssnit_numbers.append(str(other_row['SSNIT_NUMBERS']))
                merge_indices.append(other_idx)
                merge_reasons.extend(matched_ssnits)
                processed_indices.add(other_idx)

        # More than one entry means at least one row was folded into the anchor.
        if len(employee_numbers) > 1:
            grouped_count += 1

            # Combined cell text: anchor's value first, then each merged
            # row's value, separated by ' | '.
            combined_employee_numbers = ' | '.join(employee_numbers)
            combined_ssnit_numbers = ' | '.join(ssnit_numbers)

            # Write the combined text back onto the anchor row in df.
            df.at[idx, 'EMPLOYEE_NUMBERS'] = combined_employee_numbers
            df.at[idx, 'SSNIT_NUMBERS'] = combined_ssnit_numbers

            # Everything after position 0 is a merged row; the anchor stays.
            rows_to_delete.extend(merge_indices[1:])

            # Record the merge: which name, the combined values, how many
            # rows were involved, and each SSNIT pair that justified it.
            log_entries.append(f"\n--- Group {grouped_count} ---")
            log_entries.append(f"Name: {name}")
            log_entries.append(f"Combined Employee Numbers: {combined_employee_numbers}")
            log_entries.append(f"Combined SSNIT Numbers: {combined_ssnit_numbers}")
            log_entries.append(f"Rows merged: {len(employee_numbers)}")
            for reason in merge_reasons:
                log_entries.append(f"  - {reason}")

        # The anchor is settled whether or not it absorbed anything.
        processed_indices.add(idx)

# Build the output frame: discard every row that was folded into another
# one, then strip the three helper columns that only existed for matching.
# `df` itself is left untouched so its length still reflects the input.
print(f"Removing {len(rows_to_delete)} duplicate rows...")
df_cleaned = df.drop(index=rows_to_delete).drop(
    columns=['Name_Normalized', 'SSNIT_Original_List', 'SSNIT_Normalized_List']
)

# Write the consolidated data without the DataFrame index column.
print("Saving cleaned Excel file...")
df_cleaned.to_excel(output_file, index=False)

# Close the report with the overall before/after figures.
log_entries.extend([
    f"\n=== SUMMARY ===",
    f"Total records after: {len(df_cleaned)}",
    f"Records deleted: {len(rows_to_delete)}",
    f"Groups consolidated: {grouped_count}",
])

# Persist the report as newline-joined UTF-8 text.
print("Writing log file...")
with open(log_file, 'w', encoding='utf-8') as log_handle:
    log_handle.write('\n'.join(log_entries))

# Console recap for the operator, one line per print call.
for summary_line in (
    f"\n Process completed successfully!",
    f" Cleaned file saved to: {output_file}",
    f" Log file saved to: {log_file}",
    f" Records before: {len(df)}",
    f" Records after: {len(df_cleaned)}",
    f" Records removed: {len(rows_to_delete)}",
    f" Groups consolidated: {grouped_count}",
):
    print(summary_line)