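# In [None]:  (Jupyter cell prompt left over from the notebook export; commented out so the file parses as Python)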
import pandas as pd
import re

# File paths
psweps_file = r'C:\Users\spt-admin\Desktop\New\psweps.xlsx'
input_file = r'C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_SSNIT_CLEAN.xlsx'
output_file = r'C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_SSNIT_CLEAN_WITH_SP.xlsx'


def _load_workbook(path, label):
    """Read the Excel file at ``path`` and print how many rows it holds.

    Args:
        path: Location of the workbook handed to ``pd.read_excel``.
        label: Short name used in the progress message.

    Returns:
        The DataFrame produced by ``pd.read_excel(path)``.
    """
    frame = pd.read_excel(path)
    print(f"Loaded {len(frame)} rows from {label}")
    return frame


print("Loading files...")
# psweps supplies the SP_NUMBER / Employee_Number pairs used for the lookup
df_psweps = _load_workbook(psweps_file, "psweps")

# EMPLOYER_SSNIT_CLEAN is the sheet that receives the SP_NUMBER column
df_clean = _load_workbook(input_file, "EMPLOYER_SSNIT_CLEAN")

# Parse pipe-separated values
def parse_pipe_separated(value):
    """Split a ``|``-separated cell into a list of trimmed, non-empty tokens.

    Args:
        value: Raw cell content. May be a string, a number, or a missing
            value (``None`` / NaN).

    Returns:
        A list of stripped tokens in their original order; an empty list when
        the cell is missing or contains no non-blank token.
    """
    if pd.isna(value):
        return []
    # A numeric column that also contains blanks is loaded as float, so an ID
    # such as 12345 shows up as 12345.0. Stringifying that directly would give
    # '12345.0', which can never match a lookup key, so drop the spurious
    # fractional part first. Non-integral floats are left untouched.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return [token.strip() for token in str(value).split('|') if token.strip()]

print("\nBuilding SP_NUMBER lookup dictionary...")
# Create lookup dictionary: employee_number -> SP_NUMBER
sp_lookup = {}

# Walk the two needed columns directly. DataFrame.iterrows() builds a Series
# per row and does not preserve dtypes across the row, so an integer
# SP_NUMBER could come back as a float (123 -> 123.0).
for sp_number, raw_employee_numbers in zip(
    df_psweps['SP_NUMBER'], df_psweps['Employee_Number']
):
    # A missing SP_NUMBER would otherwise be stored as the literal text 'nan'
    # and later be reported as a successful match.
    if pd.isna(sp_number):
        continue
    # Integral floats (numeric column containing blanks) should read '123',
    # not '123.0'.
    if isinstance(sp_number, float) and sp_number.is_integer():
        sp_number = int(sp_number)
    sp_value = str(sp_number).strip()

    # Map each employee number to this SP_NUMBER. parse_pipe_separated already
    # strips every token. If the same employee number appears on several
    # rows, the last row wins.
    for emp_num in parse_pipe_separated(raw_employee_numbers):
        sp_lookup[emp_num] = sp_value

print(f"Created lookup with {len(sp_lookup)} employee number mappings")

# Function to check if string contains only digits
def is_numeric_only(s):
    """Return True when ``s`` is non-empty and every character is a digit.

    The decision is delegated to ``str.isdigit``, so the empty string and any
    string containing letters, spaces or punctuation yield False.
    """
    digits_only = str.isdigit(s)
    return digits_only

# Function to select best employee number based on priority
def select_best_employee_number(emp_numbers):
    """
    Pick the single employee number to use for the SP_NUMBER lookup.

    Priority:
    1. Pure numeric with fewest digits (the earliest one wins a tie)
    2. First one with letters/strings

    Args:
        emp_numbers: Sequence of employee-number strings. Surrounding
            whitespace is ignored and blank entries are skipped.

    Returns:
        The chosen employee number with whitespace stripped, or None when the
        sequence is empty or holds only blank entries.
    """
    if not emp_numbers:
        return None

    # Blank entries can never be a valid key; dropping them here stops an
    # empty string from being chosen ahead of a real alphanumeric number.
    stripped = (emp.strip() for emp in emp_numbers)
    candidates = [emp for emp in stripped if emp]
    if not candidates:
        return None

    # Priority 1: Pick numeric with fewest digits. min() returns the first
    # minimal element, so ties resolve to the earliest entry.
    numeric_only = [emp for emp in candidates if is_numeric_only(emp)]
    if numeric_only:
        return min(numeric_only, key=len)

    # Priority 2: Nothing is purely numeric, so every candidate is
    # alphanumeric and the first one is the answer.
    return candidates[0]

print("\nMatching employee numbers and adding SP_NUMBERs...")
sp_numbers = []
not_found_count = 0
row_count = len(df_clean)

# Only one column is needed, so iterate it directly. DataFrame.iterrows()
# would build a full Series for every row (slow on large sheets) and does not
# preserve dtypes across the row. Series.items() yields the same index labels
# that iterrows() would.
for idx, raw_employee_numbers in df_clean['EMPLOYEE_NUMBERS'].items():
    if idx % 10000 == 0 and idx > 0:
        print(f"Processed {idx}/{row_count} rows...")

    employee_numbers = parse_pipe_separated(raw_employee_numbers)

    # Select best employee number based on priority
    best_emp_num = select_best_employee_number(employee_numbers)

    # Look up SP_NUMBER; only the selected number is tried.
    if best_emp_num and best_emp_num in sp_lookup:
        sp_numbers.append(sp_lookup[best_emp_num])
        continue

    # No match: keep the cell empty so the column stays aligned with the rows.
    sp_numbers.append('')
    not_found_count += 1
    if not_found_count <= 10:  # Show first 10 not found
        print(f"  Warning: Employee number '{best_emp_num}' not found in psweps (from row {idx})")

# Add SP_NUMBER column
df_clean['SP_NUMBER'] = sp_numbers

# Summarise how the matching went
total_rows = len(df_clean)
matched_rows = total_rows - not_found_count
print("\nSP_NUMBER matching complete!")
print(f"Total rows: {total_rows}")
print(f"SP_NUMBERs found: {matched_rows}")
print(f"SP_NUMBERs not found: {not_found_count}")

# Write the enriched sheet to Excel, without the DataFrame index
print(f"\nSaving to {output_file}...")
df_clean.to_excel(output_file, engine='openpyxl', index=False)

print("\n Process completed successfully!")
print(f" File saved to: {output_file}")

# Preview the first rows of the result
print("\nSample of results (first 10 rows):")
preview_columns = ['EMPLOYEE_NUMBERS', 'SP_NUMBER', 'REPRESENTATIVE_NAME']
sample = df_clean[preview_columns].head(10)
print(sample.to_string(index=False))