In [3]:
import pandas as pd
from collections import defaultdict

# Load the data from first script output
file_path = r"C:\Users\spt-admin\Desktop\EMPLOYEE_NUM_PER_FIXED.xlsx"
df = pd.read_excel(file_path)

# Report the path that was actually read: the previous message hard-coded a
# different file name than the one loaded, which made the log misleading.
print(f"Loaded {len(df)} rows from {file_path}")
print(f"Columns: {df.columns.tolist()}")

# Parse pipe-separated values
def parse_pipe_separated(value):
    """Split a '|'-delimited cell into a list of trimmed, non-empty tokens.

    A missing value (anything ``pd.isna`` reports as NA) yields an empty
    list. Any other value is converted with ``str()`` before splitting, so
    non-string cells are accepted as well.
    """
    if pd.isna(value):
        return []
    trimmed = map(str.strip, str(value).split('|'))
    # filter(None, ...) drops the empty strings left by blank segments.
    return list(filter(None, trimmed))

# Extract employee numbers and SSNITs from each row
df['EMP_LIST'] = df['EMPLOYEE_NUMBERS'].apply(parse_pipe_separated)


def _ssnit_as_list(value):
    """Return the cell's SSNIT as a one-element list, or [] if missing/blank.

    Whitespace-only cells are treated like missing ones so that an empty
    string never ends up as an "SSNIT" in the joined output.
    """
    if pd.isna(value):
        return []
    text = str(value).strip()
    return [text] if text else []


df['SSNIT_LIST'] = df['SSNIT_NUMBER'].apply(_ssnit_as_list)

print("\nBuilding connection graph...")

# Build a graph where employee numbers are connected if they appear in the same row
# This will help us find transitive connections
class UnionFind:
    """Disjoint-set structure over arbitrary hashable items.

    Items are registered lazily: the first ``find`` (or ``union``) that
    mentions an item adds it as its own single-member set.
    """

    def __init__(self):
        # Maps every known item to its parent; a root maps to itself.
        self.parent = {}

    def find(self, x):
        """Return the root of the set containing ``x``, registering ``x`` if new.

        Implemented iteratively: ``union`` does no rank/size balancing, so
        parent chains can grow as long as the number of items, and a
        recursive walk over such a chain would raise ``RecursionError``.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # Pass 1: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Pass 2: path compression - point every node on the path at the root.
        while parent[x] != root:
            next_up = parent[x]
            parent[x] = root
            x = next_up
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``.

        The root of ``x``'s set becomes the root of the merged set.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x != root_y:
            self.parent[root_y] = root_x

# Use Union-Find to group all connected employee numbers
uf = UnionFind()

# For each row, connect every employee number to the first one in that row.
# The first iteration is union(first, first), which only registers the item,
# so rows holding a single employee number still enter the graph as their own
# group. Iterating the column directly avoids building a Series per row, which
# is what iterrows() does.
for emp_list in df['EMP_LIST']:
    for emp in emp_list:
        uf.union(emp_list[0], emp)

print("Connection graph built!")

# Group employee numbers by their root.
# Every emp is already a key of uf.parent, so find() only rewrites existing
# entries (path compression) and never adds keys while we iterate.
groups = defaultdict(set)
for emp in uf.parent:
    groups[uf.find(emp)].add(emp)

print(f"Found {len(groups)} connected groups")

# Create a mapping: employee_number -> row positions for fast lookup.
# Positions (0 .. len(df)-1) are recorded rather than the index labels that
# iterrows() yields, because the rows are fetched positionally later on; labels
# and positions only coincide while df keeps a default RangeIndex.
print("Creating employee number to row index mapping...")
emp_to_rows = defaultdict(list)
for row_pos, emp_list in enumerate(df['EMP_LIST']):
    for emp in emp_list:
        emp_to_rows[emp].append(row_pos)

print(f"Mapping created with {len(emp_to_rows)} unique employee numbers")

# Now collect all SSNITs and names for each group
print("Collecting SSNITs and names for each group...")
result_data = []

# Pull the two needed columns into plain lists once: indexing a list by row
# position is much cheaper than materialising a Series via df.iloc per row.
ssnit_lists = df['SSNIT_LIST'].tolist()
rep_names = df['REPRESENTATIVE_NAME'].tolist()
total_groups = len(groups)

for group_idx, emp_set in enumerate(groups.values()):
    if group_idx % 10000 == 0 and group_idx > 0:
        print(f"Processed {group_idx}/{total_groups} groups...")

    # Find all row positions that contain any employee from this group
    relevant_row_indices = set()
    for emp in emp_set:
        relevant_row_indices.update(emp_to_rows.get(emp, []))

    # Collect SSNITs from only those relevant rows, and take the first
    # non-blank name in ascending row order as the representative. Picking
    # "element 0" of a set of strings is not reproducible across runs because
    # string hashing is randomised per process.
    all_ssnits = set()
    representative_name = ''
    for idx in sorted(relevant_row_indices):
        all_ssnits.update(ssnit_lists[idx])
        if not representative_name and not pd.isna(rep_names[idx]):
            representative_name = str(rep_names[idx]).strip()

    # Create result row
    result_data.append({
        'EMPLOYEE_NUMBERS': '|'.join(sorted(emp_set)),
        'SSNIT_NUMBERS': '|'.join(sorted(all_ssnits)),
        'REPRESENTATIVE_NAME': representative_name,
    })

# Create result dataframe. The columns are named explicitly so that an empty
# result_data still produces a frame with the expected columns instead of
# failing with a KeyError on the column lookups below.
result_df = pd.DataFrame(
    result_data,
    columns=['EMPLOYEE_NUMBERS', 'SSNIT_NUMBERS', 'REPRESENTATIVE_NAME'],
)

# Sort by number of employee numbers (descending) to see biggest groups first.
# NUM_EMPLOYEES is a temporary sort key and is dropped again right away.
result_df['NUM_EMPLOYEES'] = result_df['EMPLOYEE_NUMBERS'].apply(lambda x: len(x.split('|')))
result_df = result_df.sort_values('NUM_EMPLOYEES', ascending=False).drop('NUM_EMPLOYEES', axis=1)

# Reorder columns
result_df = result_df[['SSNIT_NUMBERS', 'REPRESENTATIVE_NAME', 'EMPLOYEE_NUMBERS']]

print(f"\nFinal grouped records: {len(result_df)}")
print(f"Original records: {len(df)}")
print(f"Consolidation: {len(df)} → {len(result_df)} groups")

# Save to Excel
output_path = r"C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_FIDONE.xlsx"
result_df.to_excel(output_path, index=False, engine='openpyxl')

print(f"\n Saved to: {output_path}")


def _count_pipe_items(joined):
    """Count the entries of a '|'-joined string; an empty string holds none.

    ''.split('|') returns [''], so a plain len(...) would report one entry
    for a group that has no values at all.
    """
    return len(joined.split('|')) if joined else 0


# Show sample of results with most consolidation
# (result_df is already sorted with the largest groups first).
print("\nTop 10 groups with most employee numbers consolidated:")
sample = result_df.head(10).copy()
sample['EMP_COUNT'] = sample['EMPLOYEE_NUMBERS'].apply(_count_pipe_items)
sample['SSNIT_COUNT'] = sample['SSNIT_NUMBERS'].apply(_count_pipe_items)
print(sample[['EMP_COUNT', 'SSNIT_COUNT', 'REPRESENTATIVE_NAME']].to_string())

Loaded 136413 rows from EMPLOYEE_NUM_PER.xlsx
Columns: ['SSNIT_NUMBER', 'REPRESENTATIVE_NAME', 'EMPLOYEE_NUMBERS', 'SOURCE_FILES']

Building connection graph...
Connection graph built!
Found 69838 connected groups
Creating employee number to row index mapping...
Mapping created with 72095 unique employee numbers
Collecting SSNITs and names for each group...
Processed 10000/69838 groups...
Processed 20000/69838 groups...
Processed 30000/69838 groups...
Processed 40000/69838 groups...
Processed 50000/69838 groups...
Processed 60000/69838 groups...

Final grouped records: 69838
Original records: 136413
Consolidation: 136413 → 69838 groups

✓ Saved to: C:\Users\spt-admin\Desktop\NEWD\EMPLOYER_FIDONE.xlsx

Top 10 groups with most employee numbers consolidated:
       EMP_COUNT  SSNIT_COUNT         REPRESENTATIVE_NAME
26972          8            2       AGYENIM-BOATENG KWAKU
65256          6            1  ABDUL-RASHID HASSAN PELPUO
5807           5            1        ANYIMADU-ANTWI KWAME
14