In [None]:
```python
import pandas as pd
import re
from fuzzywuzzy import fuzz
import numpy as np
from collections import defaultdict

# Workbook locations: the two inputs and the destination of the matched output
psweps_path = r"C:\Users\spt-admin\Desktop\NEW\psweps.xlsx"
gtcl_path = r"C:\Users\spt-admin\Desktop\NEW\GTCL.xlsx"
output_path = r"C:\Users\spt-admin\Desktop\NEW\SSNIT_MATCH_EMP.xlsx"

# Read both input workbooks with read_excel's default settings, PSWEPS first
print("Loading files...")
psweps_df, gtcl_df = (
    pd.read_excel(workbook) for workbook in (psweps_path, gtcl_path)
)

# Echo the row counts and the column names of both frames
print(f"PSWEPS loaded: {len(psweps_df)} rows")
print(f"GTCL loaded: {len(gtcl_df)} rows")
print(f"PSWEPS columns: {psweps_df.columns.tolist()}")
print(f"GTCL columns: {gtcl_df.columns.tolist()}")

# Step 1: Clean individual identifier (no pipe splitting here)
def clean_identifier(value):
    """
    Normalise one identifier cell into a comparable upper-case token.

    Missing values (anything pd.isna() reports) become ''. Otherwise the value
    is stringified, surrounding whitespace is stripped, every character that
    is not a word character (letter, digit or underscore) is removed, and the
    result is upper-cased. Underscores are kept on purpose by the pattern, so
    placeholders such as "NO_SSF_NUMBER" stay recognisable.

    Whole numbers that arrive as floats (12345.0) or as the text "12345.0"
    are rendered as "12345". Without this step the decimal point would be
    stripped and the value would turn into "123450", which can never equal
    the same number read as an integer or as text.
    """
    if pd.isna(value):
        return ''

    # pandas can deliver an integer column as float64 (e.g. when it has blanks);
    # go through int so str() does not append ".0" or use exponent notation.
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    text = str(value).strip()

    # Same artefact when the float was already stringified by a caller
    # (e.g. after splitting a pipe-separated cell): "12345.0" -> "12345".
    text = re.sub(r'^(\d+)\.0+$', r'\1', text)

    # Drop everything except letters, digits and underscores
    text = re.sub(r'[^\w]', '', text)
    return text.upper()

# Check if identifier is invalid
def is_invalid_identifier(value):
    """
    Report whether an identifier should be ignored for matching.

    Returns True for missing values, for empty or whitespace-only text, for
    one of the exact placeholder values below, and for any value that
    contains one of the marker substrings. The comparison is done on the
    stripped, upper-cased string form of the value.
    """
    if pd.isna(value) or value == '':
        return True

    token = str(value).strip().upper()

    # Values that are placeholders only when they make up the whole token
    placeholders = {'', '0', 'UNKNOWN', 'NAN', 'NA', 'N/A', 'NONE'}
    # Fragments that disqualify the token wherever they appear inside it
    markers = ('TRUST', 'UNKNOWN', 'NO_SSF_NUMBER', 'UNKNO', 'NOSSF')

    return token in placeholders or any(marker in token for marker in markers)

# Split pipe-separated values and clean each
def parse_and_clean_identifiers(value):
    """
    Expand one pipe-separated cell into its usable identifiers.

    Example: "H016310070030|NO_SSF_NUMBER" -> ["H016310070030"]

    A missing cell yields an empty list. Otherwise the string form of the
    cell is split on '|', every piece goes through clean_identifier(), and
    pieces that is_invalid_identifier() rejects are left out. The order of
    the surviving pieces is kept.
    """
    if pd.isna(value):
        return []

    cleaned_pieces = (clean_identifier(piece) for piece in str(value).split('|'))
    return [piece for piece in cleaned_pieces if not is_invalid_identifier(piece)]

# PSWEPS: turn each raw identifier column into a column of lists of valid identifiers
print("\nParsing pipe-separated identifiers...")
for raw_column, list_column in (
    ('SSNIT_Number', 'SSNIT_LIST'),
    ('Employee_Number', 'EMPNUM_LIST'),
):
    psweps_df[list_column] = psweps_df[raw_column].apply(parse_and_clean_identifiers)

# GTCL: each cell is cleaned as a single value (no pipe splitting), and a
# boolean column records whether the cleaned value is usable
cleaned_gtcl_ids = gtcl_df['SSNIT No.'].apply(clean_identifier)
gtcl_df['CLEANED_SSNIT_NO'] = cleaned_gtcl_ids
gtcl_df['VALID_SSNIT'] = ~cleaned_gtcl_ids.apply(is_invalid_identifier)

# A PSWEPS record counts as valid when its identifier list is non-empty
print(f"PSWEPS - Records with valid SSNIT: {(psweps_df['SSNIT_LIST'].str.len() > 0).sum()}")
print(f"PSWEPS - Records with valid EmpNum: {(psweps_df['EMPNUM_LIST'].str.len() > 0).sum()}")
print(f"GTCL - Records with valid SSNIT: {gtcl_df['VALID_SSNIT'].sum()}")

# Step 2: Build FAST lookup dictionaries
print("\nBuilding fast lookup dictionaries...")

# Both maps have the shape {identifier: [PSWEPS index labels that carry it]}
ssnit_lookup = defaultdict(list)
empnum_lookup = defaultdict(list)

# A single pass over the index and the two list columns fills both maps,
# appending labels in row order
for row_label, ssnit_values, empnum_values in zip(
    psweps_df.index, psweps_df['SSNIT_LIST'], psweps_df['EMPNUM_LIST']
):
    for ssnit in ssnit_values:
        ssnit_lookup[ssnit].append(row_label)
    for empnum in empnum_values:
        empnum_lookup[empnum].append(row_label)

print(f"SSNIT lookup built: {len(ssnit_lookup)} unique SSNITs")
print(f"EmpNum lookup built: {len(empnum_lookup)} unique Employee Numbers")

# Step 3: Parse and clean names
def parse_name(name):
    """
    Split a name cell into upper-cased tokens.

    A missing value yields an empty list. Otherwise single and double quote
    characters are deleted, commas are treated as spaces, the text is split
    on whitespace, and tokens of a single character are discarded.
    """
    if pd.isna(name):
        return []

    # One translate pass: delete both quote characters, turn commas into spaces
    scrub_table = str.maketrans({"'": None, '"': None, ',': ' '})
    scrubbed = str(name).strip().translate(scrub_table)

    # split() already yields stripped, non-empty tokens
    return [token.upper() for token in scrubbed.split() if len(token) > 1]

print("\nParsing names...")
# Each frame gets a NAME_PARTS column derived from its own name column
for frame, name_column in ((psweps_df, 'Employee_Name'), (gtcl_df, 'Name')):
    frame['NAME_PARTS'] = frame[name_column].apply(parse_name)

# Step 4: Fuzzy name matching function
def calculate_name_similarity(name_parts_1, name_parts_2):
    """
    Score how well the tokens of the first name are covered by the second.

    Every token of name_parts_1 is compared against every token of
    name_parts_2. A pair scores its fuzz.ratio() when that is at least 85;
    failing that, it scores 90 when both tokens have three or more characters
    and one contains the other (this covers concatenated names). A token of
    name_parts_1 counts as matched when its best pair score reaches 85.

    Returns: (match_count, avg_similarity_score), where the average is taken
    over the matched tokens only. Returns (0, 0) when either list is empty or
    nothing matched.
    """
    if not (name_parts_1 and name_parts_2):
        return 0, 0

    accepted_scores = []

    for left in name_parts_1:
        top_score = 0
        for right in name_parts_2:
            ratio = fuzz.ratio(left, right)
            if ratio >= 85:
                top_score = max(top_score, ratio)
                continue
            # Containment fallback, only tried when the fuzzy ratio fell short
            long_enough = len(left) >= 3 and len(right) >= 3
            if long_enough and (left in right or right in left):
                top_score = max(top_score, 90)

        if top_score >= 85:
            accepted_scores.append(top_score)

    if not accepted_scores:
        return 0, 0

    return len(accepted_scores), sum(accepted_scores) / len(accepted_scores)

# Step 5: FAST Matching logic using lookup dictionaries
def find_best_match(gtcl_row, psweps_df, lookup_dict, match_type='SSNIT'):
    """
    Find the best match in PSWEPS for a GTCL row using fast lookup.

    Args:
        gtcl_row: row providing 'CLEANED_SSNIT_NO' and 'NAME_PARTS'.
        psweps_df: PSWEPS frame providing 'NAME_PARTS' and 'SP_NUMBER'.
        lookup_dict: {identifier: [index labels of psweps_df rows]}.
        match_type: 'SSNIT' or 'EMPNUM'; echoed back in the result so the
            caller can tell which lookup produced the match.

    Returns:
        (sp_number, match_score, match_type, matched_identifier) for the
        candidate with the highest average name similarity; the first
        candidate wins a tie. Returns (None, 0, None, None) when the GTCL
        identifier is invalid, is not in the lookup, or no candidate passes
        the name check.
    """
    gtcl_identifier = gtcl_row['CLEANED_SSNIT_NO']
    gtcl_name_parts = gtcl_row['NAME_PARTS']

    if is_invalid_identifier(gtcl_identifier):
        return None, 0, None, None

    # .get() avoids inserting an empty entry into a defaultdict on a miss
    candidate_indices = lookup_dict.get(gtcl_identifier, [])
    if not candidate_indices:
        return None, 0, None, None

    # Find best name match among candidates
    best_match = None
    best_score = 0
    best_sp_number = None
    matched_id = None

    for idx in candidate_indices:
        # The lookup stores index labels, so resolve by label (.loc). Resolving
        # by position (.iloc) is only correct while the frame still has its
        # default 0..n-1 index.
        candidate = psweps_df.loc[idx]
        psweps_name_parts = candidate['NAME_PARTS']
        match_count, avg_similarity = calculate_name_similarity(
            gtcl_name_parts, psweps_name_parts
        )

        # Require at least 2 matching name parts with 85% similarity
        if match_count >= 2 and avg_similarity >= 85:
            if avg_similarity > best_score:
                best_score = avg_similarity
                best_sp_number = candidate['SP_NUMBER']
                best_match = match_type
                matched_id = gtcl_identifier

    return best_sp_number, best_score, best_match, matched_id

# Step 6: Process each GTCL row with FAST lookups
print("\nMatching GTCL records to PSWEPS (FAST MODE)...")
results = []

# Count rows with enumerate: the index label from iterrows() is not
# guaranteed to be a 0-based integer position.
for processed, (_, gtcl_row) in enumerate(gtcl_df.iterrows(), start=1):
    # Try SSNIT match first (using fast lookup)
    sp_number, score, match_type, matched_id = find_best_match(
        gtcl_row, psweps_df, ssnit_lookup, match_type='SSNIT'
    )

    # If SSNIT match failed, try Employee Number match (using fast lookup)
    if sp_number is None:
        sp_number, score, match_type, matched_id = find_best_match(
            gtcl_row, psweps_df, empnum_lookup, match_type='EMPNUM'
        )

    results.append({
        # "No match" is signalled by None (same test as the fallback above).
        # A truthiness test here would blank out a genuine SP_NUMBER of 0
        # while the row still reported a MATCH_TYPE.
        'SP_NUMBER': sp_number if sp_number is not None else '',
        'MATCH_TYPE': match_type if match_type else '',
        'MATCH_SCORE': score if score > 0 else '',
        'MATCHED_ID': matched_id if matched_id else ''
    })

    if processed % 500 == 0:
        print(f"Processed {processed}/{len(gtcl_df)} records...")

print(f"Processed all {len(gtcl_df)} records!")

# Collect the per-row match dictionaries into a frame
results_df = pd.DataFrame(results)

# Put the four result columns at the front of GTCL, SP_NUMBER in column A
for position, column in enumerate(
    ['SP_NUMBER', 'MATCH_TYPE', 'MATCH_SCORE', 'MATCHED_ID']
):
    gtcl_df.insert(position, column, results_df[column])

# The helper columns created during matching are not part of the output
gtcl_df = gtcl_df.drop(columns=['CLEANED_SSNIT_NO', 'VALID_SSNIT', 'NAME_PARTS'])

print(f"\nMatching complete!")

# Step 7: Generate statistics
total_records = len(gtcl_df)

# Matches are counted per lookup type; a blank SP_NUMBER marks an unmatched row
match_types = gtcl_df['MATCH_TYPE']
ssnit_matches = match_types.eq('SSNIT').sum()
empnum_matches = match_types.eq('EMPNUM').sum()
no_matches = gtcl_df['SP_NUMBER'].eq('').sum()

banner = "="*80
print("\n" + banner)
print("MATCHING STATISTICS")
print(banner)
print(f"Total GTCL records:          {total_records}")
print(f"Matched via SSNIT:           {ssnit_matches} ({ssnit_matches/total_records*100:.1f}%)")
print(f"Matched via Employee Number: {empnum_matches} ({empnum_matches/total_records*100:.1f}%)")
print(f"No match found:              {no_matches} ({no_matches/total_records*100:.1f}%)")
print(f"Total matched:               {ssnit_matches + empnum_matches} ({(ssnit_matches + empnum_matches)/total_records*100:.1f}%)")

# Step 8: Save to Excel
print(f"\nSaving to Excel...")
gtcl_df.to_excel(output_path, index=False, engine='openpyxl')

print(f"Saved to: {output_path}")

# One mask separates matched rows (non-blank SP_NUMBER) from unmatched ones
has_sp_number = gtcl_df['SP_NUMBER'] != ''

# Show up to ten matched rows with the match details next to the GTCL fields
print("\nSample of matched results:")
matched_columns = ['SP_NUMBER', 'MATCH_TYPE', 'MATCH_SCORE', 'MATCHED_ID', 'Name', 'SSNIT No.']
matched_sample = gtcl_df.loc[has_sp_number, matched_columns].head(10)
if not matched_sample.empty:
    print(matched_sample)

# Show up to five unmatched rows, or say so when there are none
print("\nSample of unmatched results:")
unmatched = gtcl_df.loc[~has_sp_number, ['Name', 'SSNIT No.']].head(5)
if unmatched.empty:
    print("No unmatched records!")
else:
    print(unmatched)

# Spot check: print the first two PSWEPS and GTCL records whose name contains
# "Batumi" (case-insensitive) to show how pipe-separated values were handled
print("\n" + "="*80)
print("EXAMPLE: Checking Batumi Mahamudu case")
print("="*80)

psweps_name_hit = psweps_df['Employee_Name'].str.contains('Batumi', case=False, na=False)
Batumi_psweps = psweps_df[psweps_name_hit]
# Iterating an empty selection prints nothing, so no length check is needed
for _, row in Batumi_psweps.head(2).iterrows():
    print(f"\nPSWEPS Record:")
    print(f"  Name: {row['Employee_Name']}")
    print(f"  SSNIT_Number (raw): {row['SSNIT_Number']}")
    print(f"  SSNIT_LIST (parsed): {row['SSNIT_LIST']}")
    print(f"  SP_NUMBER: {row['SP_NUMBER']}")

gtcl_name_hit = gtcl_df['Name'].str.contains('Batumi', case=False, na=False)
Batumi_gtcl = gtcl_df[gtcl_name_hit]
for _, row in Batumi_gtcl.head(2).iterrows():
    print(f"\nGTCL Record:")
    print(f"  Name: {row['Name']}")
    print(f"  SSNIT No.: {row['SSNIT No.']}")
    print(f"  SP_NUMBER (matched): {row['SP_NUMBER']}")
    print(f"  MATCH_TYPE: {row['MATCH_TYPE']}")
    print(f"  MATCHED_ID: {row['MATCHED_ID']}")
```