In [1]:
!pip install rapidfuzz sentence-transformers torch scikit-learn pandas numpy



"""
FINAL CORRECTED SOLUTION - READY TO USE
Column name fixed to: raw_specialty
"""

import pandas as pd
import numpy as np
import re
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from enum import Enum
import warnings
warnings.filterwarnings('ignore')

from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.isotonic import IsotonicRegression

print("✓ All imports successful")

# ============================================
# SECTION 1: COMPREHENSIVE ABBREVIATION MAP
# ============================================

MEDICAL_ABBREVIATIONS = {
    r"\bcardio\b": "cardiology",
    r"\bcard\b": "cardiology",
    r"\bcv\b": "cardiovascular",
    r"\bcvs\b": "cardiovascular",
    r"\bent\b": "otolaryngology",
    r"\bento\b": "otolaryngology",
    r"\bot\b": "otolaryngology",
    r"\bsurg\b": "surgery",
    r"\bcardiothoracic\b": "cardiac surgery",
    r"\bthracic\b": "thoracic surgery",
    r"\bobgyn\b": "obstetrics and gynecology",
    r"\bob-gyn\b": "obstetrics and gynecology",
    r"\bobs\b": "obstetrics",
    r"\bgyn\b": "gynecology",
    r"\burol\b": "urology",
    r"\buro\b": "urology",
    r"\bortho\b": "orthopedics",
    r"\borthopaedic\b": "orthopedics",
    r"\borthopedic\b": "orthopedics",
    r"\bpsych\b": "psychiatry",
    r"\bpsy\b": "psychiatry",
    r"\bneuro\b": "neurology",
    r"\bneuro surg\b": "neurological surgery",
    r"\bderma\b": "dermatology",
    r"\bderm\b": "dermatology",
    r"\bpath\b": "pathology",
    r"\blap path\b": "laboratory pathology",
    r"\brad\b": "radiology",
    r"\bradiotherapy\b": "radiation therapy",
    r"\bicu\b": "critical care medicine",
    r"\bccu\b": "cardiac care",
    r"\bcritical\b": "critical care medicine",
    r"\bpedi\b": "pediatrics",
    r"\bped\b": "pediatrics",
    r"\bpediatric\b": "pediatrics",
    r"\bim\b": "internal medicine",
    r"\bim doc\b": "internal medicine",
    r"\bpt\b": "physical therapy",
    r"\bphysical med\b": "physical medicine and rehabilitation",
    r"\bmd\b": "medical doctor",
    r"\brn\b": "registered nurse",
    r"\blpn\b": "licensed practical nurse",
    r"\bpa\b": "physician assistant",
    r"\bnp\b": "nurse practitioner",
}

class MatchMethod(Enum):
    EXACT_MATCH = "exact_match"
    FUZZY_MATCH = "fuzzy_match"
    SEMANTIC_MATCH = "semantic_match"
    FALLBACK_MATCH = "fallback_match"
    NO_MATCH = "no_match"
    EMPTY_INPUT = "empty_input"

@dataclass
class MatchResult:
    primary_code: str
    primary_confidence: float
    calibrated_confidence: float
    method: MatchMethod
    is_multi_specialty: bool
    alternatives: List[Tuple[str, float]]

class SpecialtyPreprocessor:
    def __init__(self):
        self.abbreviation_map = MEDICAL_ABBREVIATIONS

    def preprocess(self, text: str) -> Tuple[str, bool]:
        if pd.isna(text) or text == '':
            return '', False

        text = str(text).strip()
        if len(text) < 2:
            return '', False

        text = re.sub(r'\s*-\s*[0-9A-Z]{10}X?\s*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'^[0-9A-Z]{10}X?\s*-\s*', '', text, flags=re.IGNORECASE)

        text = text.lower()

        for abbrev_pattern, expansion in self.abbreviation_map.items():
            text = re.sub(abbrev_pattern, expansion, text, flags=re.IGNORECASE)

        text = re.sub(r'[/&]', ' and ', text)
        text = re.sub(r'[\-_]', ' ', text)
        text = re.sub(r'[,()]', ' ', text)

        stop_words = {'service', 'center', 'clinic', 'hospital', 'department',
                      'medical', 'healthcare', 'provider', 'physician', 'doctor',
                      'general', 'office', 'practice', 'specialty', 'specialization'}
        words = text.split()
        words = [w for w in words if w not in stop_words and len(w) > 1]
        text = ' '.join(words)

        text = self._fix_common_misspellings(text)

        text = re.sub(r'\s+', ' ', text).strip()

        is_compound = ' and ' in text or (len(text.split()) >= 3)

        return text, is_compound

    def _fix_common_misspellings(self, text: str) -> str:
        corrections = {
            'clinal': 'clinical',
            'cardiak': 'cardiac',
            'diabetus': 'diabetes',
            'ural': 'urology',
            'oncolog': 'oncology',
            'patho': 'pathology',
            'radiolog': 'radiology',
            'throacic': 'thoracic',
            'neurolog': 'neurology',
        }

        for typo, correction in corrections.items():
            text = re.sub(r'\b' + typo + r'\b', correction, text, flags=re.IGNORECASE)

        return text

class SpecialtyMatcher:
    def __init__(self, nucc_df: pd.DataFrame):
        self.nucc_df = nucc_df.copy()
        self.preprocessor = SpecialtyPreprocessor()

        self.code_to_display = dict(zip(nucc_df['Code'], nucc_df['Display_Name']))
        self.nucc_display_clean = [
            self.preprocessor.preprocess(name)[0] for name in nucc_df['Display_Name']
        ]

        try:
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.nucc_embeddings = self.model.encode(self.nucc_display_clean, convert_to_tensor=True)
            self.semantic_ready = True
            print("✓ Semantic model loaded")
        except Exception as e:
            print(f"⚠ Semantic model failed: {e}")
            self.semantic_ready = False

    def match(self, specialty: str) -> MatchResult:
        cleaned, is_compound = self.preprocessor.preprocess(specialty)

        if not cleaned or len(cleaned) < 2:
            return MatchResult(
                primary_code='JUNK',
                primary_confidence=0.0,
                calibrated_confidence=0.0,
                method=MatchMethod.EMPTY_INPUT,
                is_multi_specialty=False,
                alternatives=[]
            )

        exact_result = self._exact_match(cleaned)
        if exact_result:
            code, confidence = exact_result
            return self._create_result(code, confidence, MatchMethod.EXACT_MATCH, is_compound, cleaned)

        fuzzy_result = self._fuzzy_match(cleaned)
        if fuzzy_result and fuzzy_result[1] >= 0.85:
            code, confidence = fuzzy_result
            return self._create_result(code, confidence, MatchMethod.FUZZY_MATCH, is_compound, cleaned)

        if self.semantic_ready:
            semantic_result = self._semantic_match(cleaned)
            if semantic_result and semantic_result[1] >= 0.50:
                code, confidence = semantic_result
                return self._create_result(code, confidence, MatchMethod.SEMANTIC_MATCH, is_compound, cleaned)

        if is_compound and ' and ' in cleaned:
            multi_result = self._multi_specialty_match(cleaned)
            if multi_result:
                code, confidence = multi_result
                return self._create_result(code, confidence, MatchMethod.SEMANTIC_MATCH, True, cleaned)

        if fuzzy_result:
            code, confidence = fuzzy_result
            confidence = min(confidence, 0.45)
            return self._create_result(code, confidence, MatchMethod.FALLBACK_MATCH, is_compound, cleaned)

        return MatchResult(
            primary_code='JUNK',
            primary_confidence=0.0,
            calibrated_confidence=0.0,
            method=MatchMethod.NO_MATCH,
            is_multi_specialty=is_compound,
            alternatives=[]
        )

    def _exact_match(self, cleaned: str) -> Optional[Tuple[str, float]]:
        for i, nucc_clean in enumerate(self.nucc_display_clean):
            if nucc_clean == cleaned or (nucc_clean in cleaned and len(cleaned) > 5):
                if fuzz.ratio(cleaned, nucc_clean) > 95:
                    code = self.nucc_df.iloc[i]['Code']
                    return code, 0.98
        return None

    def _fuzzy_match(self, cleaned: str) -> Optional[Tuple[str, float]]:
        best_code = None
        best_score = 0

        for i, nucc_clean in enumerate(self.nucc_display_clean):
            score = fuzz.token_set_ratio(cleaned, nucc_clean) / 100.0
            if score > best_score:
                best_score = score
                best_code = self.nucc_df.iloc[i]['Code']

        if best_score >= 0.70:
            return best_code, best_score
        return None

    def _semantic_match(self, cleaned: str) -> Optional[Tuple[str, float]]:
        if not self.semantic_ready:
            return None

        input_embedding = self.model.encode(cleaned, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(input_embedding, self.nucc_embeddings)[0]

        best_idx = torch.argmax(similarities).item()
        best_score = float(similarities[best_idx])

        if best_score >= 0.40:
            code = self.nucc_df.iloc[best_idx]['Code']
            return code, best_score

        return None

    def _get_top_alternatives(self, cleaned: str, top_n: int = 5) -> List[Tuple[str, float]]:
        if not self.semantic_ready:
            return []

        input_embedding = self.model.encode(cleaned, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(input_embedding, self.nucc_embeddings)[0]

        top_scores, top_indices = torch.topk(similarities, k=min(top_n + 1, len(similarities)))

        alternatives = []
        for idx, score in zip(top_indices.tolist(), top_scores.tolist()):
            code = self.nucc_df.iloc[idx]['Code']
            if score >= 0.35:
                alternatives.append((code, float(score)))

        return alternatives

    def _multi_specialty_match(self, cleaned: str) -> Optional[Tuple[str, float]]:
        parts = [p.strip() for p in re.split(r'\s+and\s+', cleaned)]
        if len(parts) < 2:
            return None

        part_matches = []
        for part in parts:
            if len(part) < 3:
                continue
            result = self._semantic_match(part)
            if result:
                part_matches.append(result)

        if not part_matches:
            return None

        best_code, best_conf = max(part_matches, key=lambda x: x[1])
        return best_code, best_conf * 0.95

    def _create_result(self, code: str, confidence: float, method: MatchMethod,
                      is_compound: bool, cleaned: str) -> MatchResult:
        alternatives = self._get_top_alternatives(cleaned, top_n=5)
        alternatives = [(c, s) for c, s in alternatives if c != code and s < confidence]

        return MatchResult(
            primary_code=code,
            primary_confidence=confidence,
            calibrated_confidence=confidence,
            method=method,
            is_multi_specialty=is_compound,
            alternatives=alternatives
        )

class ConfidenceCalibrator:
    def __init__(self):
        self.iso_reg = IsotonicRegression(out_of_bounds='clip')
        self.is_fitted = False

    def fit(self, original_scores: np.ndarray, ground_truth: np.ndarray):
        original_scores = np.array(original_scores).flatten()
        ground_truth = np.array(ground_truth).flatten()

        self.iso_reg.fit(original_scores, ground_truth)
        self.is_fitted = True

class JunkClassifier:
    @staticmethod
    def should_classify_junk(result: MatchResult, raw_text: str) -> bool:
        if result.method == MatchMethod.EMPTY_INPUT:
            return True

        if len(raw_text.strip()) < 2:
            return True

        if result.method == MatchMethod.EXACT_MATCH:
            return result.primary_confidence < 0.95
        elif result.method == MatchMethod.FUZZY_MATCH:
            return result.primary_confidence < 0.80
        elif result.method == MatchMethod.SEMANTIC_MATCH:
            return result.primary_confidence < 0.50
        elif result.method == MatchMethod.FALLBACK_MATCH:
            return result.primary_confidence < 0.35
        elif result.method == MatchMethod.NO_MATCH:
            return True

        return False

class ProviderSpecialtyStandardizer:
    def __init__(self, nucc_df: pd.DataFrame):
        self.nucc_df = nucc_df
        self.matcher = SpecialtyMatcher(nucc_df)
        self.calibrator = ConfidenceCalibrator()

    def standardize(self, input_df: pd.DataFrame,
                   specialty_column: str = 'raw_specialty',
                   apply_calibration: bool = False) -> pd.DataFrame:
        """Main standardization function - NOTE: specialty_column defaults to 'raw_specialty'"""

        # Verify column exists
        if specialty_column not in input_df.columns:
            raise KeyError(f"Column '{specialty_column}' not found. Available: {input_df.columns.tolist()}")

        results = []

        for idx, row in input_df.iterrows():
            specialty = row[specialty_column]
            match_result = self.matcher.match(specialty)

            is_junk = JunkClassifier.should_classify_junk(match_result, specialty)

            if is_junk:
                match_result.primary_code = 'JUNK'
                match_result.primary_confidence = 0.0
                match_result.calibrated_confidence = 0.0
            elif apply_calibration:
                cal_score = self.calibrator.calibrate(
                    np.array([match_result.primary_confidence])
                )[0]
                match_result.calibrated_confidence = cal_score
            else:
                match_result.calibrated_confidence = self._simple_calibrate(
                    match_result.primary_confidence,
                    match_result.method
                )

            results.append(match_result)

            if (idx + 1) % 1000 == 0:
                print(f"  Processed {idx + 1} records...")

        return self._format_results(input_df, results, specialty_column)

    def _simple_calibrate(self, score: float, method: MatchMethod) -> float:
        if method == MatchMethod.EXACT_MATCH:
            return min(score * 1.02, 0.95)
        elif method == MatchMethod.FUZZY_MATCH:
            return min(score * 1.05, 0.90)
        elif method == MatchMethod.SEMANTIC_MATCH:
            return min(score ** 0.5, 0.85)
        elif method == MatchMethod.FALLBACK_MATCH:
            return min(score * 0.95, 0.50)
        else:
            return score

    def _format_results(self, input_df: pd.DataFrame,
                       results: List[MatchResult],
                       specialty_column: str) -> pd.DataFrame:
        output_rows = []

        for i, (idx, row) in enumerate(input_df.iterrows()):
            result = results[i]
            cleaned, _ = self.matcher.preprocessor.preprocess(row[specialty_column])

            output_row = {
                'Specialty': row[specialty_column],
                'Preprocessed': cleaned,
                'Primary_Code': result.primary_code,
                'Original_Confidence': round(result.primary_confidence, 4),
                'Calibrated_Confidence': round(result.calibrated_confidence, 4),
                'Method': result.method.value,
                'Is_Multi_Specialty': result.is_multi_specialty,
            }

            for j, (alt_code, alt_score) in enumerate(result.alternatives[:5]):
                output_row[f'Alternative_Code_{j+1}'] = alt_code
                output_row[f'Alternative_Score_{j+1}'] = round(float(alt_score), 4)

            output_rows.append(output_row)

        output_df = pd.DataFrame(output_rows)

        for j in range(1, 6):
            if f'Alternative_Code_{j}' not in output_df.columns:
                output_df[f'Alternative_Code_{j}'] = np.nan
                output_df[f'Alternative_Score_{j}'] = np.nan

        return output_df

    def compute_validation_metrics(self, output_df: pd.DataFrame) -> Dict:
        metrics = {}

        total = len(output_df)
        junk_count = (output_df['Primary_Code'] == 'JUNK').sum()

        metrics['total_records'] = total
        metrics['junk_records'] = junk_count
        metrics['mapped_records'] = total - junk_count
        metrics['junk_percentage'] = round((junk_count / total) * 100, 2)
        metrics['mapping_success_rate'] = round(((total - junk_count) / total) * 100, 2)

        non_junk = output_df[output_df['Primary_Code'] != 'JUNK']

        metrics['avg_original_confidence'] = round(non_junk['Original_Confidence'].mean(), 4)
        metrics['avg_calibrated_confidence'] = round(non_junk['Calibrated_Confidence'].mean(), 4)
        metrics['confidence_improvement'] = round(
            metrics['avg_calibrated_confidence'] - metrics['avg_original_confidence'], 4
        )

        metrics['method_distribution'] = output_df['Method'].value_counts().to_dict()

        metrics['confidence_by_method'] = {}
        for method in output_df['Method'].unique():
            method_data = non_junk[non_junk['Method'] == method]['Calibrated_Confidence']
            if len(method_data) > 0:
                metrics['confidence_by_method'][method] = round(method_data.mean(), 4)

        multi = output_df[output_df['Is_Multi_Specialty'] == True]
        if len(multi) > 0:
            metrics['multi_specialty_count'] = len(multi)
            metrics['multi_specialty_avg_confidence'] = round(
                multi[multi['Primary_Code'] != 'JUNK']['Calibrated_Confidence'].mean(), 4
            )

        low_conf = non_junk[non_junk['Calibrated_Confidence'] < 0.60]
        metrics['low_confidence_count'] = len(low_conf)
        metrics['low_confidence_percentage'] = round((len(low_conf) / len(non_junk)) * 100, 2)

        return metrics

print("✓ All classes loaded successfully")

# ============================================
# USAGE - JUST RUN THIS
# ============================================

if __name__ == "__main__":
    # Load your data
    print("\nLoading data...")
    nucc_df = pd.read_csv('nucc_taxonomy_master.csv')
    input_df = pd.read_csv('input_specialties.csv')

    print(f"✓ NUCC records: {len(nucc_df)}")
    print(f"✓ Input specialties: {len(input_df)}")
    print(f"✓ Input columns: {input_df.columns.tolist()}")

    # Create standardizer
    standardizer = ProviderSpecialtyStandardizer(nucc_df)

    # Run standardization with correct column name
    print("\n✓ Starting standardization...")
    output_df = standardizer.standardize(
        input_df,
        specialty_column='raw_specialty'  # <-- CORRECT COLUMN NAME
    )

    # Compute metrics
    metrics = standardizer.compute_validation_metrics(output_df)
    print("\n=== VALIDATION METRICS ===")
    for key, value in sorted(metrics.items()):
        print(f"{key}: {value}")

    # Save output
    output_df.to_csv('output_standardized_CORRECTED.csv', index=False)
    print("\n✓ Saved to output_standardized_CORRECTED.csv")
    print(f"\nOutput shape: {output_df.shape}")
    print("\nFirst few results:")
    print(output_df[['Specialty', 'Preprocessed', 'Primary_Code', 'Calibrated_Confidence', 'Method']].head(10))



# ============================================
# POST-PROCESSING: CREATE CONSOLIDATED DATAFRAME
# ============================================

# 1️⃣  Rename "Specialty" → "raw_specialty" to match your input naming
df_final = output_df.rename(columns={"Specialty": "raw_specialty"})

# 2️⃣  Collect all NUCC codes whose score ≥ 0.6 (from primary + alternatives)
def collect_codes(row):
    codes = []
    # primary
    if row["Primary_Code"] != "JUNK" and row["Calibrated_Confidence"] >= 0.6:
        codes.append(row["Primary_Code"])
    # alternatives
    for i in range(1, 6):
        alt_code = row.get(f"Alternative_Code_{i}")
        alt_score = row.get(f"Alternative_Score_{i}")
        if pd.notna(alt_code) and pd.notna(alt_score) and alt_score >= 0.6:
            codes.append(alt_code)

    # join or mark JUNK
    if not codes:
        return "JUNK"
    return " | ".join(sorted(set(codes)))

df_final["NUCC_Codes"] = df_final.apply(collect_codes, axis=1)

# 3️⃣  Junk flag
df_final["Junk"] = df_final["NUCC_Codes"].apply(lambda x: True if x == "JUNK" else False)

# 4️⃣  Add confidence column (copy of Calibrated_Confidence)
df_final["Confidence"] = df_final["Calibrated_Confidence"]

# 5️⃣  Keep only useful columns
df_final = df_final[["raw_specialty", "NUCC_Codes", "Confidence", "Junk"]]

# 6️⃣  Save the consolidated view
df_final.to_csv("output_standardized_SUMMARY.csv", index=False)
print("\n✓ Created summary file: output_standardized_SUMMARY.csv")
print(df_final.head(10))

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3
✓ All imports successful
✓ All classes loaded successfully

Loading data...
✓ NUCC records: 879
✓ Input specialties: 10050
✓ Input columns: ['raw_specialty']


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Semantic model loaded

✓ Starting standardization...
  Processed 1000 records...
  Processed 2000 records...
  Processed 3000 records...
  Processed 4000 records...
  Processed 5000 records...
  Processed 6000 records...
  Processed 7000 records...
  Processed 8000 records...
  Processed 9000 records...
  Processed 10000 records...

=== VALIDATION METRICS ===
avg_calibrated_confidence: 0.909
avg_original_confidence: 0.9575
confidence_by_method: {'semantic_match': np.float64(0.8307), 'fuzzy_match': np.float64(0.9), 'exact_match': np.float64(0.95), 'fallback_match': np.float64(0.4275)}
confidence_improvement: -0.0485
junk_percentage: 5.0
junk_records: 503
low_confidence_count: 44
low_confidence_percentage: 0.46
mapped_records: 9547
mapping_success_rate: 95.0
method_distribution: {'fuzzy_match': 4874, 'exact_match': 3586, 'semantic_match': 1043, 'no_match': 401, 'empty_input': 102, 'fallback_match': 44}
multi_specialty_avg_confidence: 0.9066
multi_specialty_count: 3673
total_records: 10