In [None]:

import pandas as pd
import numpy as np
from rapidfuzz import fuzz, process
import re
import phonetics
from geopy.distance import geodesic
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class SimpleHotelMatcher:
    def __init__(self):
        """
        Inicjalizacja prostego matchera z własnym drzewem decyzyjnym
        """
        self.reference_hotels = None
        self.api_hotels = None
        self.country_mappings = self._create_country_mappings()
        
    def _create_country_mappings(self) -> Dict[str, str]:
        """
        Tworzenie mapowania krajów na kody ISO - kompletna lista
        """
        country_map = {
            # Rate Hawk API codes
            'TH': 'Thailand',
            'AE': 'United Arab Emirates', 
            'SC': 'Seychelles',
            
            # Lista referencyjna - wszystkie kraje
            'Turcja': 'Turkey',
            'Hiszpania': 'Spain', 
            'UAE': 'United Arab Emirates',
            'Czarnogóra': 'Montenegro',
            'Włochy': 'Italy',
            'Malezja': 'Malaysia',
            'Malediwy': 'Maldives',
            'Grecja': 'Greece',
            'Seszele': 'Seychelles',
            'Maroko': 'Morocco',
            'Egipt': 'Egypt',
            'Mauritius': 'Mauritius',
            'Wielka Brytania': 'United Kingdom',
            'Portugalia': 'Portugal',
            'Tajlandia': 'Thailand',
            'Szwajcaria': 'Switzerland',
            'Australia': 'Australia',
            'Indonezja': 'Indonesia',
            'Aruba': 'Aruba',
            'Grenlandia': 'Greenland',
            'Wietnam': 'Vietnam',
            'Francja': 'France',
            'USA': 'United States',
            'Singapur': 'Singapore',
            'Islandia': 'Iceland',
            'Qatar': 'Qatar',
            'RPA': 'South Africa',
            'Dominikana': 'Dominican Republic',
            'Explora': 'Chile',  # Explora cruise line
            
            # Alternative spellings
            'Turkey': 'Turkey',
            'Spain': 'Spain',
            'Montenegro': 'Montenegro',
            'Italy': 'Italy',
            'Malaysia': 'Malaysia',
            'Maldives': 'Maldives',
            'Greece': 'Greece',
            'Seychelles': 'Seychelles',
            'Morocco': 'Morocco',
            'Egypt': 'Egypt',
            'United Kingdom': 'United Kingdom',
            'Portugal': 'Portugal',
            'Thailand': 'Thailand',
            'Switzerland': 'Switzerland',
            'Australia': 'Australia',
            'Indonesia': 'Indonesia',
            'Vietnam': 'Vietnam',
            'France': 'France',
            'United States': 'United States',
            'Singapore': 'Singapore',
            'Iceland': 'Iceland',
            'Qatar': 'Qatar',
            'South Africa': 'South Africa',
            'Dominican Republic': 'Dominican Republic',
            'Chile': 'Chile'
        }
        return country_map
    
    def load_reference_hotels(self, csv_path: str) -> pd.DataFrame:
        """
        Ładowanie i przetwarzanie listy referencyjnej hoteli
        """
        print("\n" + "="*50)
        print("📂 KROK 1A: ŁADOWANIE LISTY REFERENCYJNEJ")
        print("="*50)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Załadowano plik: {csv_path}")
            print(f"📊 Znaleziono {len(df)} hoteli w pliku")
        except Exception as e:
            print(f"❌ Błąd podczas ładowania pliku: {e}")
            return None
        
        print("\n🧹 Czyszczenie i normalizacja danych...")
        
        # Czyszczenie i normalizacja danych referencyjnych
        df['clean_location'] = df['Lokalizacja'].str.strip()
        df['clean_hotel'] = df['Hotel'].str.strip()
        
        # Ekstrakcja kraju i miasta z lokalizacji
        print("🌍 Ekstrakcja krajów i miast...")
        df[['country', 'city']] = df['clean_location'].str.split(',', n=1, expand=True)
        df['country'] = df['country'].fillna('').str.strip()
        df['city'] = df['city'].fillna('').str.strip()
        
        # Dodanie kodów ISO
        print("🏳️ Mapowanie kodów krajów...")
        df['country_iso'] = df['country'].map(self.country_mappings)
        df['country_iso'] = df['country_iso'].fillna(df['country'])
        
        # Pokazanie statystyk krajów
        country_stats = df['country_iso'].value_counts()
        print(f"📈 Top 5 krajów:")
        for country, count in country_stats.head().items():
            print(f"   {country}: {count} hoteli")
        
        # Normalizacja nazw hoteli
        print("✂️ Normalizacja nazw hoteli...")
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        
        # Tworzenie unikalnego ID
        df['reference_id'] = df.index.astype(str).str.zfill(3)
        
        self.reference_hotels = df
        print(f"✅ Przetworzono {len(df)} hoteli referencyjnych")
        
        # Pokazanie przykładów normalizacji
        print(f"\n📋 Przykłady normalizacji nazw:")
        for i in range(min(3, len(df))):
            original = df.iloc[i]['clean_hotel']
            normalized = df.iloc[i]['normalized_name']
            print(f"   '{original}' → '{normalized}'")
        
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        """
        Ładowanie i przetwarzanie hoteli z Rate Hawk API
        """
        print("\n" + "="*50)
        print("📡 KROK 1B: ŁADOWANIE DANYCH Z RATE HAWK API")
        print("="*50)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Załadowano plik: {csv_path}")
            print(f"📊 Znaleziono {len(df)} hoteli z API")
        except Exception as e:
            print(f"❌ Błąd podczas ładowania pliku: {e}")
            return None
        
        print("\n🧹 Czyszczenie danych API...")
        
        # Podstawowe czyszczenie danych
        df['clean_name'] = df['name'].fillna('').str.strip()
        df['clean_city'] = df['city'].fillna('').str.strip()
        df['clean_country'] = df['country'].fillna('').str.strip()
        df['clean_address'] = df['address'].fillna('').str.strip()
        
        # Statystyki krajów w API
        country_stats = df['clean_country'].value_counts()
        print(f"🌍 Top 5 krajów w API:")
        for country, count in country_stats.head().items():
            print(f"   {country}: {count} hoteli")
        
        # Normalizacja nazw
        print("✂️ Normalizacja nazw hoteli API...")
        df['normalized_name'] = df['clean_name'].apply(self._normalize_hotel_name)
        
        # Mapowanie krajów na pełne nazwy
        print("🏳️ Mapowanie kodów krajów API...")
        df['country_full'] = df['clean_country'].map(self.country_mappings)
        df['country_full'] = df['country_full'].fillna(df['clean_country'])
        
        # Czyszczenie współrzędnych
        print("🗺️ Walidacja współrzędnych geograficznych...")
        df['lat'] = pd.to_numeric(df['latitude'], errors='coerce')
        df['lng'] = pd.to_numeric(df['longitude'], errors='coerce')
        
        coords_valid = df[['lat', 'lng']].notna().all(axis=1).sum()
        print(f"📍 Hoteli z prawidłowymi współrzędnymi: {coords_valid}/{len(df)} ({coords_valid/len(df)*100:.1f}%)")
        
        # Normalizacja chain names
        df['clean_chain'] = df['hotel_chain'].fillna('No chain').str.strip()
        chain_stats = df['clean_chain'].value_counts()
        print(f"🏨 Top 3 sieci hotelowe w API:")
        for chain, count in chain_stats.head(3).items():
            if chain != 'No chain':
                print(f"   {chain}: {count} hoteli")
        
        self.api_hotels = df
        print(f"✅ Przetworzono {len(df)} hoteli z API")
        
        return df
    
    def _normalize_hotel_name(self, name: str) -> str:
        """
        Normalizacja nazwy hotelu dla lepszego matchingu
        """
        if pd.isna(name) or not name:
            return ""
        
        # Konwersja na lowercase
        name = str(name).lower().strip()
        
        # Usunięcie znaków specjalnych ale zachowanie spacji
        name = re.sub(r'[^\w\s]', ' ', name)
        
        # Usunięcie typowych słów hotelowych
        stop_words = [
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 
            'motel', 'palace', 'club', 'grand', 'royal'
        ]
        
        words = name.split()
        words = [word for word in words if word not in stop_words and len(word) > 1]
        
        # Usunięcie duplikatów zachowując kolejność
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def exact_match(self) -> List[Dict]:
        """
        Exact matching - szukanie identycznych nazw i lokalizacji
        Z COUNTRY PRE-FILTERING dla optymalizacji
        """
        print("\n" + "="*50)
        print("🎯 KROK 2: EXACT MATCHING")
        print("="*50)
        print("Szukanie identycznych nazw i lokalizacji...")
        print("🔧 OPTYMALIZACJA: Pre-filtering po krajach")
        
        exact_matches = []
        total_comparisons = 0
        skipped_by_country = 0
        
        print(f"Sprawdzanie {len(self.reference_hotels)} hoteli referencyjnych vs {len(self.api_hotels)} z API\n")
        
        for ref_idx, ref_hotel in self.reference_hotels.iterrows():
            ref_name = ref_hotel['normalized_name']
            ref_country = ref_hotel['country_iso']
            ref_city = ref_hotel['city']
            
            print(f"Sprawdzanie [{ref_idx+1:2d}/{len(self.reference_hotels)}]: {ref_hotel['clean_hotel'][:50]}...")
            print(f"   🌍 Kraj: {ref_country}")
            
            # PRE-FILTERING: Tylko hotele z tego samego kraju lub podobnego
            candidates = []
            for _, api_hotel in self.api_hotels.iterrows():
                api_country = api_hotel['country_full']
                country_similarity = self._country_similarity(ref_country, api_country)
                
                # Filtruj tylko hotele z podobnym krajem (>0.7 similarity)
                if country_similarity > 0.7:
                    candidates.append((api_hotel, country_similarity))
                else:
                    skipped_by_country += 1
            
            print(f"   📍 Kandydatów po filtrze krajów: {len(candidates)}/{len(self.api_hotels)} ({len(candidates)/len(self.api_hotels)*100:.1f}%)")
            
            matches_for_hotel = 0
            
            # Sprawdzanie tylko kandydatów z podobnych krajów
            for api_hotel, country_sim in candidates:
                total_comparisons += 1
                
                api_name = api_hotel['normalized_name']
                api_city = api_hotel['clean_city']
                
                # Exact name match
                if ref_name and api_name and ref_name == api_name:
                    exact_matches.append({
                        'reference_id': ref_hotel['reference_id'],
                        'reference_name': ref_hotel['clean_hotel'],
                        'api_id': api_hotel['id'],
                        'api_name': api_hotel['clean_name'],
                        'match_type': 'exact_name',
                        'confidence': 1.0,
                        'decision_path': 'Identical normalized names'
                    })
                    matches_for_hotel += 1
                    print(f"   ✅ EXACT MATCH: {api_hotel['clean_name'][:40]}...")
                    continue
                
                # Near exact with location
                name_similarity = fuzz.ratio(ref_name, api_name) / 100.0
                if name_similarity > 0.95:
                    city_match = self._city_similarity(ref_city, api_city)
                    
                    if country_sim > 0.8 or city_match > 0.8:
                        exact_matches.append({
                            'reference_id': ref_hotel['reference_id'],
                            'reference_name': ref_hotel['clean_hotel'],
                            'api_id': api_hotel['id'],
                            'api_name': api_hotel['clean_name'],
                            'match_type': 'near_exact',
                            'confidence': min(name_similarity + 0.05, 1.0),
                            'decision_path': f'High name similarity ({name_similarity:.2f}) + location match'
                        })
                        matches_for_hotel += 1
                        print(f"   ✅ NEAR EXACT: {api_hotel['clean_name'][:40]}... (sim: {name_similarity:.2f})")
            
            if matches_for_hotel == 0:
                print(f"   ❌ Brak exact matches")
            print()
        
        print(f"🎯 EXACT MATCHING ZAKOŃCZONY!")
        print(f"✅ Znaleziono {len(exact_matches)} exact matches")
        print(f"📊 Coverage exact: {len(exact_matches)}/{len(self.reference_hotels)} ({len(exact_matches)/len(self.reference_hotels)*100:.1f}%)")
        print(f"⚡ Optymalizacja:")
        print(f"   📊 Porównań wykonanych: {total_comparisons:,}")
        print(f"   🚫 Pominięto przez filtr krajów: {skipped_by_country:,}")
        print(f"   🔥 Reduction: {(skipped_by_country/(total_comparisons+skipped_by_country))*100:.1f}%")
        
        return exact_matches
    
    def custom_decision_tree(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """
        WŁASNE DRZEWO DECYZYJNE - logika oparta na fuzzy matching
        
        Decision Tree Logic:
        1. Sprawdź high similarity (>80%) + same country → MATCH
        2. Sprawdź chain similarity + medium similarity (>60%) → MATCH  
        3. Sprawdź word order similarity + same city → MATCH
        4. Sprawdź phonetic + high partial ratio → MATCH
        5. Else → NO MATCH
        """
        
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Obliczenie wszystkich features
        features = self._calculate_features(ref_hotel, api_hotel)
        
        # DECISION TREE LOGIC
        
        # NODE 1: High Similarity + Same Country
        if features['fuzz_ratio'] > 0.80 and features['same_country'] > 0.8:
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'match_type': 'decision_tree',
                'confidence': min(0.85 + features['fuzz_ratio'] * 0.1, 0.95),
                'decision_path': f"HIGH_SIMILARITY + SAME_COUNTRY (ratio: {features['fuzz_ratio']:.2f})"
            }
        
        # NODE 2: Chain Similarity + Medium Name Similarity  
        if features['chain_similarity'] > 0.8 and features['fuzz_ratio'] > 0.60:
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'match_type': 'decision_tree',
                'confidence': 0.75 + features['fuzz_ratio'] * 0.1,
                'decision_path': f"CHAIN_MATCH + MEDIUM_SIMILARITY (chain: {features['chain_similarity']:.2f})"
            }
        
        # NODE 3: Word Order + Same City (Marriott Warsaw vs Warsaw Marriott)
        if features['word_order_sim'] > 0.80 and features['same_city'] > 0.7:
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'match_type': 'decision_tree',
                'confidence': 0.70 + features['word_order_sim'] * 0.15,
                'decision_path': f"WORD_ORDER + SAME_CITY (word_order: {features['word_order_sim']:.2f})"
            }
        
        # NODE 4: Phonetic Similarity + High Partial Ratio
        if features['soundex_match'] == 1.0 and features['fuzz_partial'] > 0.85:
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'match_type': 'decision_tree',
                'confidence': 0.65 + features['fuzz_partial'] * 0.1,
                'decision_path': f"PHONETIC + HIGH_PARTIAL (soundex + partial: {features['fuzz_partial']:.2f})"
            }
        
        # NODE 5: Token Sort High + Medium Overall Similarity
        if features['fuzz_token_sort'] > 0.85 and features['fuzz_ratio'] > 0.50:
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'match_type': 'decision_tree',
                'confidence': 0.60 + features['fuzz_token_sort'] * 0.1,
                'decision_path': f"TOKEN_SORT + MEDIUM_SIM (token_sort: {features['fuzz_token_sort']:.2f})"
            }
        
        # NO MATCH
        return None
    
    def _calculate_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        """
        Obliczenie wszystkich features dla decision tree
        """
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        features = {}
        
        # String similarities
        if ref_name and api_name:
            features['fuzz_ratio'] = fuzz.ratio(ref_name, api_name) / 100.0
            features['fuzz_partial'] = fuzz.partial_ratio(ref_name, api_name) / 100.0
            features['fuzz_token_sort'] = fuzz.token_sort_ratio(ref_name, api_name) / 100.0
            features['fuzz_token_set'] = fuzz.token_set_ratio(ref_name, api_name) / 100.0
        else:
            features.update({
                'fuzz_ratio': 0, 'fuzz_partial': 0, 
                'fuzz_token_sort': 0, 'fuzz_token_set': 0
            })
        
        # Geographic similarity
        ref_country = ref_hotel.get('country_iso', '')
        api_country = api_hotel.get('country_full', '')
        features['same_country'] = self._country_similarity(ref_country, api_country)
        
        ref_city = ref_hotel.get('city', '')
        api_city = api_hotel.get('clean_city', '')
        features['same_city'] = self._city_similarity(ref_city, api_city)
        
        # Word order similarity
        features['word_order_sim'] = self._word_order_similarity(ref_name, api_name)
        
        # Phonetic similarity
        features['soundex_match'] = self._phonetic_similarity(ref_name, api_name)
        
        # Chain similarity
        features['chain_similarity'] = self._chain_similarity(ref_hotel, api_hotel)
        
        return features
    
    def _country_similarity(self, country1: str, country2: str) -> float:
        """
        Sprawdzenie podobieństwa krajów
        """
        if not country1 or not country2:
            return 0.0
        
        country1 = str(country1).lower().strip()
        country2 = str(country2).lower().strip()
        
        if country1 == country2:
            return 1.0
        
        # Sprawdzenie czy jeden zawiera drugi
        if country1 in country2 or country2 in country1:
            return 0.9
        
        # Fuzzy similarity dla krajów
        return fuzz.ratio(country1, country2) / 100.0
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        """
        Sprawdzenie podobieństwa miast
        """
        if not city1 or not city2:
            return 0.0
        
        city1 = str(city1).lower().strip()
        city2 = str(city2).lower().strip()
        
        if city1 == city2:
            return 1.0
        
        return fuzz.ratio(city1, city2) / 100.0
    
    def _word_order_similarity(self, name1: str, name2: str) -> float:
        """
        Sprawdzenie czy to te same słowa w innej kolejności
        """
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _phonetic_similarity(self, name1: str, name2: str) -> float:
        """
        Sprawdzenie podobieństwa fonetycznego
        """
        if not name1 or not name2:
            return 0.0
        
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return 1.0 if soundex1 == soundex2 else 0.0
        except:
            return 0.0
    
    def _chain_similarity(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> float:
        """
        Sprawdzenie podobieństwa sieci hotelowych - rozszerzona lista
        """
        ref_name = ref_hotel.get('clean_hotel', '')
        api_chain = api_hotel.get('clean_chain', '')
        
        # Rozszerzona lista znanych sieci hotelowych z twojej listy
        chains = [
            # Luxury chains
            'four seasons', 'one only', 'waldorf astoria', 'mandarin oriental',
            'ritz carlton', 'ritz-carlton', 'st regis', 'shangri-la',
            
            # Premium chains  
            'marriott', 'hilton', 'hyatt', 'intercontinental', 'sheraton',
            'westin', 'doubletree', 'regent', 'fairmont', 'raffles',
            
            # Boutique/Regional
            'atlantis', 'ikos', 'movenpick', 'mövenpick', 'banyan tree',
            'six senses', 'rosewood', 'belmond', 'dusit thani',
            'angsana', 'chedi', 'lefay', 'forte village',
            
            # Resort brands
            'bahia del duque', 'grecotel', 'domes', 'sani', 'daios',
            'puente romano', 'marbella club', 'gloria', 'maxx royal',
            
            # Cruise/Special
            'quark expeditions', 'explora'
        ]
        
        ref_lower = ref_name.lower()
        api_lower = api_chain.lower()
        
        # Exact chain match
        for chain in chains:
            if chain in ref_lower and chain in api_lower:
                return 1.0
        
        # Fuzzy chain similarity
        if api_chain != 'No chain' and len(api_chain) > 3:
            chain_sim = fuzz.ratio(api_chain.lower(), ref_name.lower()) / 100.0
            if chain_sim > 0.6:  # Threshold dla chain similarity
                return chain_sim
        
        return 0.0
    
    def fuzzy_decision_matching(self) -> List[Dict]:
        """
        Fuzzy matching używając custom decision tree
        """
        print("\n" + "="*50)
        print("🧠 KROK 3: FUZZY DECISION TREE MATCHING")
        print("="*50)
        print("Używanie własnego drzewa decyzyjnego z fuzzy features...")
        
        fuzzy_matches = []
        total_hotels = len(self.reference_hotels)
        
        print(f"Sprawdzanie {total_hotels} hoteli referencyjnych")
        print("Decision Tree Logic:")
        print("  NODE 1: High Similarity (>80%) + Same Country")
        print("  NODE 2: Chain Match + Medium Similarity (>60%)")  
        print("  NODE 3: Word Order Match + Same City")
        print("  NODE 4: Phonetic Match + High Partial")
        print("  NODE 5: Token Sort + Medium Overall\n")
        
        for ref_idx, ref_hotel in self.reference_hotels.iterrows():
            print(f"Analizowanie [{ref_idx+1:2d}/{total_hotels}]: {ref_hotel['clean_hotel'][:50]}...")
            
            best_match = None
            best_confidence = 0
            candidates_checked = 0
            decision_paths = []
            
            for _, api_hotel in self.api_hotels.iterrows():
                candidates_checked += 1
                
                # Sprawdzenie przez decision tree
                match_result = self.custom_decision_tree(ref_hotel, api_hotel)
                
                if match_result and match_result['confidence'] > best_confidence:
                    best_confidence = match_result['confidence']
                    best_match = match_result
                    decision_paths.append(f"   Candidate: {api_hotel['clean_name'][:30]}... → {match_result['decision_path']}")
                
                # Progress co 500 kandydatów
                if candidates_checked % 500 == 0:
                    print(f"   Sprawdzono {candidates_checked}/{len(self.api_hotels)} kandydatów...")
            
            if best_match and best_confidence >= 0.6:  # Minimum threshold
                fuzzy_matches.append(best_match)
                print(f"   ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"   📊 Confidence: {best_confidence:.3f}")
                print(f"   🎯 Decision: {best_match['decision_path']}")
                
                # Pokaż najlepsze alternatywy
                if len(decision_paths) > 1:
                    print(f"   📋 Top alternatives:")
                    for path in decision_paths[-3:]:  # Last 3 best
                        print(path)
            else:
                print(f"   ❌ Brak matches (best confidence: {best_confidence:.3f})")
            
            print()
        
        print(f"🧠 FUZZY DECISION MATCHING ZAKOŃCZONY!")
        print(f"✅ Znaleziono {len(fuzzy_matches)} fuzzy matches")
        print(f"📊 Coverage fuzzy: {len(fuzzy_matches)}/{total_hotels} ({len(fuzzy_matches)/total_hotels*100:.1f}%)")
        
        return fuzzy_matches
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        """
        Kompletny pipeline matchingu bez ML
        """
        print("🚀" + "="*58 + "🚀")
        print("🚀 HOTEL MAPPING ENGINE - SIMPLE DECISION TREE VERSION 🚀")
        print("🚀" + "="*58 + "🚀")
        print("Mapowanie hoteli premium do Rate Hawk API")
        print("Algorytm: Exact Match + Custom Decision Tree + Fuzzy Features")
        
        # Krok 1: Ładowanie danych
        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}
            
        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}
        
        # Krok 2: Exact matching
        exact_matches = self.exact_match()
        
        # Krok 3: Fuzzy decision tree matching
        fuzzy_matches = self.fuzzy_decision_matching()
        
        # Krok 4: Kombinowanie wyników (unikanie duplikatów)
        print("\n" + "="*50)
        print("🔗 KROK 4: KOMBINOWANIE WYNIKÓW")
        print("="*50)
        
        all_matches = exact_matches.copy()
        
        # Dodaj fuzzy matches które nie są już w exact matches
        exact_ref_ids = {match['reference_id'] for match in exact_matches}
        new_fuzzy_matches = 0
        
        for fuzzy_match in fuzzy_matches:
            if fuzzy_match['reference_id'] not in exact_ref_ids:
                all_matches.append(fuzzy_match)
                new_fuzzy_matches += 1
        
        print(f"📊 Exact matches: {len(exact_matches)}")
        print(f"📊 Nowe fuzzy matches: {new_fuzzy_matches}")
        print(f"📊 Duplikaty pominięte: {len(fuzzy_matches) - new_fuzzy_matches}")
        print(f"📊 Łączne matches: {len(all_matches)}")
        
        # Krok 5: Generowanie raportów
        print("\n" + "="*50)
        print("📈 KROK 5: GENEROWANIE RAPORTÓW")
        print("="*50)
        
        coverage = len(all_matches) / len(self.reference_hotels) * 100
        
        results = {
            'total_reference_hotels': len(self.reference_hotels),
            'total_api_hotels': len(self.api_hotels),
            'exact_matches': len(exact_matches),
            'fuzzy_matches': new_fuzzy_matches,
            'total_matches': len(all_matches),
            'coverage_percentage': coverage,
            'matches': all_matches
        }
        
        print(f"🎯 FINAL RESULTS:")
        print(f"   📚 Hoteli referencyjnych: {results['total_reference_hotels']}")
        print(f"   📡 Hoteli z API: {results['total_api_hotels']:,}")
        print(f"   ✅ Exact matches: {results['exact_matches']}")
        print(f"   🧠 Fuzzy matches: {results['fuzzy_matches']}")
        print(f"   🎯 Total matches: {results['total_matches']}")
        print(f"   📊 Coverage: {coverage:.1f}%")
        
        # Confidence distribution
        if all_matches:
            confidences = [match['confidence'] for match in all_matches]
            high_conf = sum(1 for c in confidences if c >= 0.9)
            med_conf = sum(1 for c in confidences if 0.7 <= c < 0.9)
            low_conf = sum(1 for c in confidences if c < 0.7)
            
            print(f"\n📊 CONFIDENCE DISTRIBUTION:")
            print(f"   🟢 High confidence (≥0.9): {high_conf}")
            print(f"   🟡 Medium confidence (0.7-0.9): {med_conf}")
            print(f"   🔴 Low confidence (<0.7): {low_conf}")
        
        return results
    
    def save_results(self, results: Dict, output_prefix: str = "hotel_mapping"):
        """
        Zapisywanie wyników do plików CSV
        """
        print("\nZapisywanie wyników...")
        
        # Plik 1: Lista referencyjna z kodami ISO
        reference_with_iso = self.reference_hotels[[
            'Lokalizacja', 'Hotel', 'country', 'city', 'country_iso', 'reference_id'
        ]].copy()
        
        reference_file = f"{output_prefix}_reference_with_iso.csv"
        reference_with_iso.to_csv(reference_file, index=False)
        print(f"Zapisano: {reference_file}")
        
        # Plik 2: Wszystkie matches
        if results['matches']:
            matches_df = pd.DataFrame(results['matches'])
            matches_file = f"{output_prefix}_matches.csv"
            matches_df.to_csv(matches_file, index=False)
            print(f"Zapisano: {matches_file}")
        
        # Plik 3: Podsumowanie
        summary = {
            'metric': [
                'Total Reference Hotels',
                'Total API Hotels', 
                'Exact Matches',
                'Fuzzy Decision Matches',
                'Total Matches',
                'Coverage Percentage'
            ],
            'value': [
                results['total_reference_hotels'],
                results['total_api_hotels'],
                results['exact_matches'],
                results['fuzzy_matches'],
                results['total_matches'],
                f"{results['coverage_percentage']:.1f}%"
            ]
        }
        
        summary_df = pd.DataFrame(summary)
        summary_file = f"{output_prefix}_summary.csv"
        summary_df.to_csv(summary_file, index=False)
        print(f"Zapisano: {summary_file}")
        
        print(f"\nCoverage: {results['coverage_percentage']:.1f}% ({results['total_matches']}/{results['total_reference_hotels']})")


# Usage
if __name__ == "__main__":
    # Create the matcher.
    matcher = SimpleHotelMatcher()

    # Run the complete matching pipeline.
    results = matcher.run_complete_matching(
        reference_csv="lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"
    )

    if 'error' in results:
        # Loading failed: there is nothing to save, and calling
        # save_results() with an error dict would only raise.
        print(f"❌ {results['error']}")
    else:
        # Save the results.
        matcher.save_results(results)

        print("\n" + "=" * 60)
        print("SIMPLE DECISION TREE MATCHING ZAKOŃCZONY!")
        print("=" * 60)


#     ==================================================
# 📈 KROK 5: GENEROWANIE RAPORTÓW
# ==================================================
# 🎯 FINAL RESULTS:
#    📚 Hoteli referencyjnych: 99
#    📡 Hoteli z API: 95,463
#    ✅ Exact matches: 11
#    🧠 Fuzzy matches: 37
#    🎯 Total matches: 48
#    📊 Coverage: 48.5%

# 📊 CONFIDENCE DISTRIBUTION:
#    🟢 High confidence (≥0.9): 20
#    🟡 Medium confidence (0.7-0.9): 27
#    🔴 Low confidence (<0.7): 1

# Zapisywanie wyników...
# Zapisano: hotel_mapping_reference_with_iso.csv
# Zapisano: hotel_mapping_matches.csv
# Zapisano: hotel_mapping_summary.csv

# Coverage: 48.5% (48/99)

🚀 HOTEL MAPPING ENGINE - SIMPLE DECISION TREE VERSION 🚀
Mapowanie hoteli premium do Rate Hawk API
Algorytm: Exact Match + Custom Decision Tree + Fuzzy Features

📂 KROK 1A: ŁADOWANIE LISTY REFERENCYJNEJ
✅ Załadowano plik: lista_referencyjna.csv
📊 Znaleziono 99 hoteli w pliku

🧹 Czyszczenie i normalizacja danych...
🌍 Ekstrakcja krajów i miast...
🏳️ Mapowanie kodów krajów...
📈 Top 5 krajów:
   Greece: 13 hoteli
   Maldives: 12 hoteli
   United Arab Emirates: 11 hoteli
   Spain: 8 hoteli
   Thailand: 7 hoteli
✂️ Normalizacja nazw hoteli...
✅ Przetworzono 99 hoteli referencyjnych

📋 Przykłady normalizacji nazw:
   'Regnum Carya Golf & Spa Resort, Turcja' → 'regnum carya golf turcja'
   'SH Valencia Palace, Walencja' → 'sh valencia walencja'
   'Atlantis The Royal' → 'atlantis the'

📡 KROK 1B: ŁADOWANIE DANYCH Z RATE HAWK API
✅ Załadowano plik: 01_api_rate_hawk.csv
📊 Znaleziono 95463 hoteli z API

🧹 Czyszczenie danych API...
🌍 Top 5 krajów w API:
   AE: 12942 hoteli
   GR: 8592 hoteli
   TH:

In [10]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import re
import phonetics
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class ISOHotelMatcher:
    def __init__(self):
        self.reference_hotels = None
        self.api_hotels = None
        self.iso_mappings = self._create_iso_mappings()
        self.country_to_iso = self._create_reverse_iso_lookup()
        self.luxury_brands = self._create_luxury_brands()
        
    def _create_iso_mappings(self) -> Dict[str, Dict]:
        return {
            'AE': {
                'iso_code': 'AE',
                'official_name': 'United Arab Emirates',
                'variants': ['UAE', 'U.A.E', 'United Arab Emirates', 'Emiraty Arabskie', 'Zjednoczone Emiraty Arabskie']
            },
            'TH': {
                'iso_code': 'TH',
                'official_name': 'Thailand', 
                'variants': ['Thailand', 'Tajlandia', 'Siam', 'Kingdom of Thailand']
            },
            'SC': {
                'iso_code': 'SC',
                'official_name': 'Seychelles',
                'variants': ['Seychelles', 'Seszele', 'Republic of Seychelles', 'Sesel']
            },
            'TR': {
                'iso_code': 'TR',
                'official_name': 'Turkey',
                'variants': ['Turkey', 'Turcja', 'Türkiye', 'Republic of Turkey']
            },
            'ES': {
                'iso_code': 'ES', 
                'official_name': 'Spain',
                'variants': ['Spain', 'Hiszpania', 'España', 'Kingdom of Spain']
            },
            'ME': {
                'iso_code': 'ME',
                'official_name': 'Montenegro',
                'variants': ['Montenegro', 'Czarnogóra', 'Crna Gora']
            },
            'IT': {
                'iso_code': 'IT',
                'official_name': 'Italy',
                'variants': ['Italy', 'Włochy', 'Italia', 'Italian Republic']
            },
            'MY': {
                'iso_code': 'MY',
                'official_name': 'Malaysia',
                'variants': ['Malaysia', 'Malezja']
            },
            'MV': {
                'iso_code': 'MV',
                'official_name': 'Maldives',
                'variants': ['Maldives', 'Malediwy', 'Republic of Maldives']
            },
            'GR': {
                'iso_code': 'GR',
                'official_name': 'Greece',
                'variants': ['Greece', 'Grecja', 'Hellas', 'Hellenic Republic']
            },
            'MA': {
                'iso_code': 'MA',
                'official_name': 'Morocco',
                'variants': ['Morocco', 'Maroko', 'Kingdom of Morocco']
            },
            'EG': {
                'iso_code': 'EG',
                'official_name': 'Egypt',
                'variants': ['Egypt', 'Egipt', 'Arab Republic of Egypt']
            },
            'MU': {
                'iso_code': 'MU',
                'official_name': 'Mauritius',
                'variants': ['Mauritius', 'Republic of Mauritius']
            },
            'GB': {
                'iso_code': 'GB',
                'official_name': 'United Kingdom',
                'variants': ['United Kingdom', 'Wielka Brytania', 'UK', 'Great Britain']
            },
            'PT': {
                'iso_code': 'PT',
                'official_name': 'Portugal',
                'variants': ['Portugal', 'Portugalia', 'Portuguese Republic']
            },
            'CH': {
                'iso_code': 'CH',
                'official_name': 'Switzerland',
                'variants': ['Switzerland', 'Szwajcaria', 'Swiss Confederation']
            },
            'AU': {
                'iso_code': 'AU',
                'official_name': 'Australia',
                'variants': ['Australia', 'Commonwealth of Australia']
            },
            'ID': {
                'iso_code': 'ID',
                'official_name': 'Indonesia',
                'variants': ['Indonesia', 'Indonezja', 'Republic of Indonesia']
            },
            'AW': {
                'iso_code': 'AW',
                'official_name': 'Aruba',
                'variants': ['Aruba']
            },
            'GL': {
                'iso_code': 'GL',
                'official_name': 'Greenland',
                'variants': ['Greenland', 'Grenlandia']
            },
            'VN': {
                'iso_code': 'VN',
                'official_name': 'Vietnam',
                'variants': ['Vietnam', 'Wietnam', 'Socialist Republic of Vietnam']
            },
            'FR': {
                'iso_code': 'FR',
                'official_name': 'France',
                'variants': ['France', 'Francja', 'French Republic']
            },
            'US': {
                'iso_code': 'US',
                'official_name': 'United States',
                'variants': ['United States', 'USA', 'US', 'America']
            },
            'SG': {
                'iso_code': 'SG',
                'official_name': 'Singapore',
                'variants': ['Singapore', 'Singapur', 'Republic of Singapore']
            },
            'IS': {
                'iso_code': 'IS',
                'official_name': 'Iceland',
                'variants': ['Iceland', 'Islandia', 'Republic of Iceland']
            },
            'QA': {
                'iso_code': 'QA',
                'official_name': 'Qatar',
                'variants': ['Qatar', 'State of Qatar']
            },
            'ZA': {
                'iso_code': 'ZA',
                'official_name': 'South Africa',
                'variants': ['South Africa', 'RPA', 'Republic of South Africa']
            },
            'DO': {
                'iso_code': 'DO',
                'official_name': 'Dominican Republic',
                'variants': ['Dominican Republic', 'Dominikana']
            },
            'CL': {
                'iso_code': 'CL',
                'official_name': 'Chile',
                'variants': ['Chile', 'Republic of Chile', 'Explora']
            }
        }
    
    def _create_reverse_iso_lookup(self) -> Dict[str, str]:
        lookup = {}
        for iso_code, data in self.iso_mappings.items():
            for variant in data['variants']:
                lookup[variant.lower().strip()] = iso_code
        return lookup
    
    def _create_luxury_brands(self) -> List[str]:
        return [
            'four seasons', 'atlantis', 'banyan tree', 'mandarin oriental',
            'one only', 'angsana', 'shangri-la', 'ritz carlton', 'ritz-carlton',
            'st regis', 'waldorf astoria', 'hilton', 'marriott', 'hyatt',
            'intercontinental', 'sheraton', 'westin', 'doubletree', 'regent',
            'fairmont', 'raffles', 'rosewood', 'belmond', 'six senses',
            'chedi', 'dusit thani', 'movenpick', 'mövenpick'
        ]
    
    def _map_country_to_iso(self, country_str: str) -> Optional[str]:
        if not country_str:
            return None
        
        country_clean = country_str.lower().strip()
        return self.country_to_iso.get(country_clean)
    
    def _extract_brand(self, hotel_name: str) -> Optional[str]:
        if not hotel_name:
            return None
        
        name_lower = hotel_name.lower()
        for brand in self.luxury_brands:
            if brand in name_lower:
                return brand
        return None
    
    def _normalize_hotel_name(self, name: str) -> str:
        if pd.isna(name) or not name:
            return ""
        
        name = str(name).lower().strip()
        name = re.sub(r'[^\w\s]', ' ', name)
        
        stop_words = [
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 
            'motel', 'palace', 'club', 'grand', 'royal', 'the', 'at'
        ]
        
        words = [word for word in name.split() if word not in stop_words and len(word) > 1]
        
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def _normalize_city_name(self, city: str) -> str:
        if not city:
            return ""
        
        city_mappings = {
            'dubaj': 'dubai',
            'krabi': 'krabi',
            'desroches': 'desroches island'
        }
        
        city_clean = city.lower().strip()
        return city_mappings.get(city_clean, city_clean)
    
    def load_reference_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📂 LOADING REFERENCE HOTELS")
        print("="*60)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(df)} reference hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None
        
        print("\n🧹 Processing reference data...")
        df['clean_location'] = df['Lokalizacja'].str.strip()
        df['clean_hotel'] = df['Hotel'].str.strip()
        
        df[['country_raw', 'city_raw']] = df['clean_location'].str.split(',', n=1, expand=True)
        df['country_raw'] = df['country_raw'].fillna('').str.strip()
        df['city_raw'] = df['city_raw'].fillna('').str.strip()
        
        print("🌍 Mapping countries to ISO codes...")
        df['country_iso'] = df['country_raw'].apply(self._map_country_to_iso)
        
        unmapped_countries = df[df['country_iso'].isna()]['country_raw'].unique()
        if len(unmapped_countries) > 0:
            print(f"⚠️  Unmapped countries: {list(unmapped_countries)}")
        
        df['city'] = df['city_raw'].apply(self._normalize_city_name)
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        df['brand'] = df['clean_hotel'].apply(self._extract_brand)
        df['reference_id'] = df.index.astype(str).str.zfill(3)
        
        country_stats = df['country_iso'].value_counts()
        print(f"📈 Countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count} hotels")
        
        brands_found = df['brand'].dropna().nunique()
        print(f"🏨 Luxury brands detected: {brands_found}")
        
        self.reference_hotels = df
        print(f"✅ Processed {len(df)} reference hotels")
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📡 LOADING API HOTELS")
        print("="*60)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(df):,} API hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None
        
        print("\n🧹 Processing API data...")
        df['clean_name'] = df['name'].fillna('').str.strip()
        df['clean_city'] = df['city'].fillna('').str.strip()
        df['clean_address'] = df['address'].fillna('').str.strip()
        df['clean_chain'] = df['hotel_chain'].fillna('').str.strip()
        
        df['country_iso'] = df['country'].fillna('').str.upper().str.strip()
        df['normalized_name'] = df['clean_name'].apply(self._normalize_hotel_name)
        df['city_normalized'] = df['clean_city'].apply(self._normalize_city_name)
        
        df['lat'] = pd.to_numeric(df['latitude'], errors='coerce')
        df['lng'] = pd.to_numeric(df['longitude'], errors='coerce')
        
        country_stats = df['country_iso'].value_counts()
        print(f"🌍 API countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count:,} hotels")
        
        coords_valid = df[['lat', 'lng']].notna().all(axis=1).sum()
        print(f"📍 Hotels with coordinates: {coords_valid:,}/{len(df):,} ({coords_valid/len(df)*100:.1f}%)")
        
        chain_stats = df[df['clean_chain'] != '']['clean_chain'].value_counts()
        print(f"🏨 Top 3 hotel chains:")
        for chain, count in chain_stats.head(3).items():
            print(f"   {chain}: {count:,} hotels")
        
        self.api_hotels = df
        print(f"✅ Processed {len(df):,} API hotels")
        return df
    
    def _calculate_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return self._empty_features()
        
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'country_exact': ref_hotel['country_iso'] == api_hotel['country_iso'],
            'city_similarity': self._city_similarity(ref_hotel['city'], api_hotel['city_normalized']),
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        return features
    
    def _empty_features(self) -> Dict:
        return {
            'fuzz_ratio': 0.0, 'fuzz_partial': 0.0, 'fuzz_token_sort': 0.0,
            'fuzz_token_set': 0.0, 'country_exact': False, 'city_similarity': 0.0,
            'brand_exact': False, 'brand_similarity': 0.0, 'soundex_match': False,
            'word_intersection': 0.0, 'premium_keywords': 0.0
        }
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        if not city1 or not city2:
            return 0.0
        return fuzz.ratio(city1.lower(), city2.lower()) / 100.0
    
    def _brand_exact_match(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        ref_brand = ref_hotel.get('brand')
        api_chain = api_hotel.get('clean_chain', '').lower()
        
        if not ref_brand or not api_chain:
            return False
        
        return ref_brand.lower() in api_chain or api_chain in ref_brand.lower()
    
    def _brand_similarity(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> float:
        ref_brand = ref_hotel.get('brand', '')
        api_chain = api_hotel.get('clean_chain', '')
        
        if not ref_brand or not api_chain:
            return 0.0
        
        return fuzz.ratio(ref_brand, api_chain) / 100.0
    
    def _soundex_match(self, name1: str, name2: str) -> bool:
        if not name1 or not name2:
            return False
        
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return soundex1 == soundex2
        except:
            return False
    
    def _word_intersection_ratio(self, name1: str, name2: str) -> float:
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _premium_keywords_overlap(self, name1: str, name2: str) -> float:
        premium_keywords = ['royal', 'palace', 'luxury', 'exclusive', 'premium', 'collection', 'reserve']
        
        name1_lower = name1.lower()
        name2_lower = name2.lower()
        
        name1_keywords = sum(1 for kw in premium_keywords if kw in name1_lower)
        name2_keywords = sum(1 for kw in premium_keywords if kw in name2_lower)
        
        if name1_keywords == 0 and name2_keywords == 0:
            return 0.0
        
        common_keywords = sum(1 for kw in premium_keywords if kw in name1_lower and kw in name2_lower)
        total_keywords = max(name1_keywords, name2_keywords)
        
        return common_keywords / total_keywords if total_keywords > 0 else 0.0
    
    def _high_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_exact'] and features['country_exact'] and features['city_similarity'] > 0.8:
            return {
                'match': True,
                'confidence': 0.95,
                'reason': 'Perfect brand + location'
            }
        
        if features['fuzz_token_sort'] > 0.92 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.90,
                'reason': f'Very high token sort ({features["fuzz_token_sort"]:.2f}) + country'
            }
        
        if features['fuzz_token_set'] > 0.90 and features['country_exact'] and features['city_similarity'] > 0.7:
            return {
                'match': True,
                'confidence': 0.88,
                'reason': f'High token set ({features["fuzz_token_set"]:.2f}) + location'
            }
        
        return None
    
    def _medium_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_similarity'] > 0.8 and features['fuzz_token_sort'] > 0.75 and features['country_exact']:
            confidence = 0.70 + (features['fuzz_token_sort'] - 0.75) * 0.4
            return {
                'match': True,
                'confidence': min(confidence, 0.85),
                'reason': f'Brand + good name similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_partial'] > 0.85 and features['country_exact'] and (features['brand_similarity'] > 0.6 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'High partial ({features["fuzz_partial"]:.2f}) + indicators'
            }
        
        if features['fuzz_token_set'] > 0.80 and features['country_exact'] and features['city_similarity'] > 0.6:
            confidence = 0.65 + (features['fuzz_token_set'] - 0.80) * 0.5
            return {
                'match': True,
                'confidence': min(confidence, 0.82),
                'reason': f'Token set ({features["fuzz_token_set"]:.2f}) + location'
            }
        
        return None
    
    def _lower_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['soundex_match'] and features['fuzz_partial'] > 0.75 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.60,
                'reason': f'Phonetic + partial ({features["fuzz_partial"]:.2f})'
            }
        
        if (features['word_intersection'] > 0.6 and features['premium_keywords'] > 0.4 and 
            features['country_exact'] and features['city_similarity'] > 0.5):
            return {
                'match': True,
                'confidence': 0.55,
                'reason': f'Word intersection ({features["word_intersection"]:.2f}) + premium + location'
            }
        
        return None
    
    def _make_matching_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        features = self._calculate_features(ref_hotel, api_hotel)
        
        result = self._high_confidence_rules(features)
        if result:
            return result
        
        result = self._medium_confidence_rules(features)
        if result:
            return result
        
        result = self._lower_confidence_rules(features)
        if result:
            return result
        
        return None
    
    def run_matching(self) -> List[Dict]:
        print("\n" + "="*60)
        print("🎯 RUNNING HOTEL MATCHING")
        print("="*60)
        
        matches = []
        api_by_country = self.api_hotels.groupby('country_iso')
        total_hotels = len(self.reference_hotels)
        
        print(f"📊 Processing {total_hotels} reference hotels")
        print(f"🔧 ISO pre-filtering active - grouping API by country")
        print(f"🌍 API countries available: {len(api_by_country.groups)}")
        
        skipped_no_country = 0
        skipped_no_api_country = 0
        total_comparisons = 0
        
        for idx, (_, ref_hotel) in enumerate(self.reference_hotels.iterrows(), 1):
            ref_iso = ref_hotel['country_iso']
            ref_name = ref_hotel['clean_hotel']
            
            print(f"\n[{idx:2d}/{total_hotels}] Processing: {ref_name[:50]}...")
            print(f"    🌍 Country: {ref_iso}")
            
            if not ref_iso:
                print(f"    ❌ No ISO code mapping - skipped")
                skipped_no_country += 1
                continue
            
            if ref_iso not in api_by_country.groups:
                print(f"    ❌ No API hotels for {ref_iso} - skipped") 
                skipped_no_api_country += 1
                continue
            
            candidates = api_by_country.get_group(ref_iso)
            print(f"    📍 Candidates after ISO filter: {len(candidates):,}")
            
            best_match = None
            best_confidence = 0.0
            candidates_checked = 0
            
            for _, api_hotel in candidates.iterrows():
                candidates_checked += 1
                total_comparisons += 1
                
                result = self._make_matching_decision(ref_hotel, api_hotel)
                
                if result and result['confidence'] > best_confidence:
                    best_confidence = result['confidence']
                    best_match = {
                        'reference_id': ref_hotel['reference_id'],
                        'reference_name': ref_hotel['clean_hotel'],
                        'api_id': api_hotel['id'],
                        'api_name': api_hotel['clean_name'],
                        'api_chain': api_hotel['clean_chain'],
                        'api_city': api_hotel['clean_city'],
                        'api_address': api_hotel['clean_address'],
                        'api_latitude': api_hotel.get('lat'),
                        'api_longitude': api_hotel.get('lng'),
                        'confidence': result['confidence'],
                        'match_reason': result['reason']
                    }
                
                if candidates_checked % 1000 == 0:
                    print(f"    ... checked {candidates_checked:,}/{len(candidates):,} candidates")
            
            if best_match and best_confidence >= 0.55:
                matches.append(best_match)
                confidence_level = "🟢 HIGH" if best_confidence >= 0.85 else "🟡 MEDIUM" if best_confidence >= 0.70 else "🔴 LOW"
                print(f"    ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"    📊 Confidence: {best_confidence:.3f} ({confidence_level})")
                print(f"    🎯 Reason: {best_match['match_reason']}")
            else:
                print(f"    ❌ No match found (best confidence: {best_confidence:.3f})")
        
        print(f"\n" + "="*60)
        print("🎯 MATCHING COMPLETED")
        print("="*60)
        print(f"✅ Total matches found: {len(matches)}")
        print(f"📊 Coverage: {len(matches)}/{total_hotels} ({len(matches)/total_hotels*100:.1f}%)")
        print(f"⚡ Performance stats:")
        print(f"   📊 Total comparisons: {total_comparisons:,}")
        print(f"   🚫 Skipped (no ISO): {skipped_no_country}")
        print(f"   🚫 Skipped (no API country): {skipped_no_api_country}")
        
        if matches:
            confidences = [m['confidence'] for m in matches]
            high_conf = sum(1 for c in confidences if c >= 0.85)
            med_conf = sum(1 for c in confidences if 0.70 <= c < 0.85)
            low_conf = sum(1 for c in confidences if c < 0.70)
            
            print(f"📈 Confidence distribution:")
            print(f"   🟢 High (≥0.85): {high_conf}")
            print(f"   🟡 Medium (0.70-0.84): {med_conf}")
            print(f"   🔴 Low (<0.70): {low_conf}")
        
        return matches
    
    def create_full_reference_with_results(self, matches: List[Dict]) -> pd.DataFrame:
        results_df = self.reference_hotels[['Lokalizacja', 'Hotel', 'reference_id']].copy()
        
        match_dict = {match['reference_id']: match for match in matches}
        
        results_df['api_id'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_id', ''))
        results_df['api_name'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_name', ''))
        results_df['api_chain'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_chain', ''))
        results_df['api_city'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_city', ''))
        results_df['api_address'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_address', ''))
        results_df['api_latitude'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_latitude', ''))
        results_df['api_longitude'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('api_longitude', ''))
        results_df['confidence'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('confidence', ''))
        results_df['match_reason'] = results_df['reference_id'].map(lambda x: match_dict.get(x, {}).get('match_reason', ''))
        results_df['matched'] = results_df['api_id'] != ''
        
        return results_df
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        print("🚀" + "="*58 + "🚀")
        print("🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀")
        print("🚀" + "="*58 + "🚀")
        print("Algorithm: ISO Pre-filtering + Rule-based Decision Tree")
        print("Features: Brand extraction + Geographic + String similarity")
        
        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}
            
        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}
        
        matches = self.run_matching()
        
        results = {
            'total_reference_hotels': len(self.reference_hotels),
            'total_api_hotels': len(self.api_hotels),
            'total_matches': len(matches),
            'coverage_percentage': len(matches) / len(self.reference_hotels) * 100,
            'matches': matches
        }
        
        return results
    
    def save_results(self, results: Dict, output_prefix: str = "hotel_mapping_iso"):
        print("\n" + "="*60)
        print("💾 SAVING RESULTS")
        print("="*60)
        
        matches = results['matches']
        
        if matches:
            matches_df = pd.DataFrame(matches)
            matches_file = f"{output_prefix}_matches.csv"
            matches_df.to_csv(matches_file, index=False)
            print(f"✅ Saved matches: {matches_file}")
        
        full_results = self.create_full_reference_with_results(matches)
        full_file = f"{output_prefix}_full_reference.csv"
        full_results.to_csv(full_file, index=False)
        print(f"✅ Saved full reference: {full_file}")
        
        summary = pd.DataFrame({
            'metric': ['Total Reference Hotels', 'Total API Hotels', 'Total Matches', 'Coverage Percentage'],
            'value': [results['total_reference_hotels'], results['total_api_hotels'], 
                     results['total_matches'], f"{results['coverage_percentage']:.1f}%"]
        })
        summary_file = f"{output_prefix}_summary.csv"
        summary.to_csv(summary_file, index=False)
        print(f"✅ Saved summary: {summary_file}")
        
        print(f"\n🎯 FINAL RESULTS:")
        print(f"   📚 Reference hotels: {results['total_reference_hotels']}")
        print(f"   📡 API hotels: {results['total_api_hotels']:,}")
        print(f"   ✅ Matches found: {results['total_matches']}")
        print(f"   📊 Coverage: {results['coverage_percentage']:.1f}%")


if __name__ == "__main__":
    # Script entry point: run the full pipeline on the two default CSV files
    # and save the outcome.
    matcher = ISOHotelMatcher()

    results = matcher.run_complete_matching(
        reference_csv="lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"
    )

    if 'error' in results:
        # A failed load returns {'error': ...}; there is nothing to save and
        # the run must not be reported as completed.
        print(f"\n❌ Hotel mapping aborted: {results['error']}")
    else:
        matcher.save_results(results)

        print("\n" + "="*60)
        print("🏁 HOTEL MAPPING COMPLETED!")
        print("="*60)

🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀
Algorithm: ISO Pre-filtering + Rule-based Decision Tree
Features: Brand extraction + Geographic + String similarity

📂 LOADING REFERENCE HOTELS
✅ Loaded file: lista_referencyjna.csv
📊 Found 99 reference hotels

🧹 Processing reference data...
🌍 Mapping countries to ISO codes...
⚠️  Unmapped countries: ['The Ritz-Carlton Yacht Collection']
📈 Countries distribution:
   GR (Greece): 13 hotels
   MV (Maldives): 12 hotels
   AE (United Arab Emirates): 11 hotels
   ES (Spain): 8 hotels
   TH (Thailand): 7 hotels
🏨 Luxury brands detected: 23
✅ Processed 99 reference hotels

📡 LOADING API HOTELS
✅ Loaded file: 01_api_rate_hawk.csv
📊 Found 95,463 API hotels

🧹 Processing API data...
🌍 API countries distribution:
   AE (United Arab Emirates): 12,942 hotels
   GR (Greece): 8,592 hotels
   TH (Thailand): 8,063 hotels
   US (United States): 7,858 hotels
   ES (Spain): 6,906 hotels
📍 Hotels with coordinates: 95,463/95,463 (100.0%)
🏨 Top 3 hotel chains:

In [12]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import re
import phonetics
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class ISOHotelMatcher:
    def __init__(self):
        self.reference_hotels = None
        self.api_hotels = None
        self.iso_mappings = self._create_iso_mappings()
        self.country_to_iso = self._create_reverse_iso_lookup()
        self.luxury_brands = self._create_luxury_brands()
        
    def _create_iso_mappings(self) -> Dict[str, Dict]:
        return {
            'AE': {
                'iso_code': 'AE',
                'official_name': 'United Arab Emirates',
                'variants': ['UAE', 'U.A.E', 'United Arab Emirates', 'Emiraty Arabskie', 'Zjednoczone Emiraty Arabskie']
            },
            'TH': {
                'iso_code': 'TH',
                'official_name': 'Thailand', 
                'variants': ['Thailand', 'Tajlandia', 'Siam', 'Kingdom of Thailand']
            },
            'SC': {
                'iso_code': 'SC',
                'official_name': 'Seychelles',
                'variants': ['Seychelles', 'Seszele', 'Republic of Seychelles', 'Sesel']
            },
            'TR': {
                'iso_code': 'TR',
                'official_name': 'Turkey',
                'variants': ['Turkey', 'Turcja', 'Türkiye', 'Republic of Turkey']
            },
            'ES': {
                'iso_code': 'ES', 
                'official_name': 'Spain',
                'variants': ['Spain', 'Hiszpania', 'España', 'Kingdom of Spain']
            },
            'ME': {
                'iso_code': 'ME',
                'official_name': 'Montenegro',
                'variants': ['Montenegro', 'Czarnogóra', 'Crna Gora']
            },
            'IT': {
                'iso_code': 'IT',
                'official_name': 'Italy',
                'variants': ['Italy', 'Włochy', 'Italia', 'Italian Republic']
            },
            'MY': {
                'iso_code': 'MY',
                'official_name': 'Malaysia',
                'variants': ['Malaysia', 'Malezja']
            },
            'MV': {
                'iso_code': 'MV',
                'official_name': 'Maldives',
                'variants': ['Maldives', 'Malediwy', 'Republic of Maldives']
            },
            'GR': {
                'iso_code': 'GR',
                'official_name': 'Greece',
                'variants': ['Greece', 'Grecja', 'Hellas', 'Hellenic Republic']
            },
            'MA': {
                'iso_code': 'MA',
                'official_name': 'Morocco',
                'variants': ['Morocco', 'Maroko', 'Kingdom of Morocco']
            },
            'EG': {
                'iso_code': 'EG',
                'official_name': 'Egypt',
                'variants': ['Egypt', 'Egipt', 'Arab Republic of Egypt']
            },
            'MU': {
                'iso_code': 'MU',
                'official_name': 'Mauritius',
                'variants': ['Mauritius', 'Republic of Mauritius']
            },
            'GB': {
                'iso_code': 'GB',
                'official_name': 'United Kingdom',
                'variants': ['United Kingdom', 'Wielka Brytania', 'UK', 'Great Britain']
            },
            'PT': {
                'iso_code': 'PT',
                'official_name': 'Portugal',
                'variants': ['Portugal', 'Portugalia', 'Portuguese Republic']
            },
            'CH': {
                'iso_code': 'CH',
                'official_name': 'Switzerland',
                'variants': ['Switzerland', 'Szwajcaria', 'Swiss Confederation']
            },
            'AU': {
                'iso_code': 'AU',
                'official_name': 'Australia',
                'variants': ['Australia', 'Commonwealth of Australia']
            },
            'ID': {
                'iso_code': 'ID',
                'official_name': 'Indonesia',
                'variants': ['Indonesia', 'Indonezja', 'Republic of Indonesia']
            },
            'AW': {
                'iso_code': 'AW',
                'official_name': 'Aruba',
                'variants': ['Aruba']
            },
            'GL': {
                'iso_code': 'GL',
                'official_name': 'Greenland',
                'variants': ['Greenland', 'Grenlandia']
            },
            'VN': {
                'iso_code': 'VN',
                'official_name': 'Vietnam',
                'variants': ['Vietnam', 'Wietnam', 'Socialist Republic of Vietnam']
            },
            'FR': {
                'iso_code': 'FR',
                'official_name': 'France',
                'variants': ['France', 'Francja', 'French Republic']
            },
            'US': {
                'iso_code': 'US',
                'official_name': 'United States',
                'variants': ['United States', 'USA', 'US', 'America']
            },
            'SG': {
                'iso_code': 'SG',
                'official_name': 'Singapore',
                'variants': ['Singapore', 'Singapur', 'Republic of Singapore']
            },
            'IS': {
                'iso_code': 'IS',
                'official_name': 'Iceland',
                'variants': ['Iceland', 'Islandia', 'Republic of Iceland']
            },
            'QA': {
                'iso_code': 'QA',
                'official_name': 'Qatar',
                'variants': ['Qatar', 'State of Qatar']
            },
            'ZA': {
                'iso_code': 'ZA',
                'official_name': 'South Africa',
                'variants': ['South Africa', 'RPA', 'Republic of South Africa']
            },
            'DO': {
                'iso_code': 'DO',
                'official_name': 'Dominican Republic',
                'variants': ['Dominican Republic', 'Dominikana']
            },
            'CL': {
                'iso_code': 'CL',
                'official_name': 'Chile',
                'variants': ['Chile', 'Republic of Chile', 'Explora']
            }
        }
    
    def _create_reverse_iso_lookup(self) -> Dict[str, str]:
        lookup = {}
        for iso_code, data in self.iso_mappings.items():
            for variant in data['variants']:
                lookup[variant.lower().strip()] = iso_code
        return lookup
    
    def _create_luxury_brands(self) -> List[str]:
        return [
            'four seasons', 'atlantis', 'banyan tree', 'mandarin oriental',
            'one only', 'angsana', 'shangri-la', 'ritz carlton', 'ritz-carlton',
            'st regis', 'waldorf astoria', 'hilton', 'marriott', 'hyatt',
            'intercontinental', 'sheraton', 'westin', 'doubletree', 'regent',
            'fairmont', 'raffles', 'rosewood', 'belmond', 'six senses',
            'chedi', 'dusit thani', 'movenpick', 'mövenpick'
        ]
    
    def _map_country_to_iso(self, country_str: str) -> Optional[str]:
        if not country_str:
            return None
        
        country_clean = country_str.lower().strip()
        return self.country_to_iso.get(country_clean)
    
    def _extract_brand(self, hotel_name: str) -> Optional[str]:
        if not hotel_name:
            return None
        
        name_lower = hotel_name.lower()
        for brand in self.luxury_brands:
            if brand in name_lower:
                return brand
        return None
    
    def _normalize_hotel_name(self, name: str) -> str:
        if pd.isna(name) or not name:
            return ""
        
        name = str(name).lower().strip()
        name = re.sub(r'[^\w\s]', ' ', name)
        
        stop_words = [
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 
            'motel', 'palace', 'club', 'grand', 'royal', 'the', 'at'
        ]
        
        words = [word for word in name.split() if word not in stop_words and len(word) > 1]
        
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def _normalize_city_name(self, city: str) -> str:
        if not city:
            return ""
        
        city_mappings = {
            'dubaj': 'dubai',
            'krabi': 'krabi',
            'desroches': 'desroches island'
        }
        
        city_clean = city.lower().strip()
        return city_mappings.get(city_clean, city_clean)
    
    def load_reference_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📂 LOADING REFERENCE HOTELS")
        print("="*60)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(df)} reference hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None
        
        print("\n🧹 Processing reference data...")
        df['clean_location'] = df['Lokalizacja'].str.strip()
        df['clean_hotel'] = df['Hotel'].str.strip()
        
        df[['country_raw', 'city_raw']] = df['clean_location'].str.split(',', n=1, expand=True)
        df['country_raw'] = df['country_raw'].fillna('').str.strip()
        df['city_raw'] = df['city_raw'].fillna('').str.strip()
        
        print("🌍 Mapping countries to ISO codes...")
        df['country_iso'] = df['country_raw'].apply(self._map_country_to_iso)
        
        unmapped_countries = df[df['country_iso'].isna()]['country_raw'].unique()
        if len(unmapped_countries) > 0:
            print(f"⚠️  Unmapped countries: {list(unmapped_countries)}")
        
        df['city'] = df['city_raw'].apply(self._normalize_city_name)
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        df['brand'] = df['clean_hotel'].apply(self._extract_brand)
        df['reference_id'] = df.index.astype(str).str.zfill(3)
        
        country_stats = df['country_iso'].value_counts()
        print(f"📈 Countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count} hotels")
        
        brands_found = df['brand'].dropna().nunique()
        print(f"🏨 Luxury brands detected: {brands_found}")
        
        self.reference_hotels = df
        print(f"✅ Processed {len(df)} reference hotels")
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📡 LOADING API HOTELS")
        print("="*60)
        
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(df):,} API hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None
        
        print("\n🧹 Processing API data...")
        df['clean_name'] = df['name'].fillna('').str.strip()
        df['clean_city'] = df['city'].fillna('').str.strip()
        df['clean_address'] = df['address'].fillna('').str.strip()
        df['clean_chain'] = df['hotel_chain'].fillna('').str.strip()
        
        df['country_iso'] = df['country'].fillna('').str.upper().str.strip()
        df['normalized_name'] = df['clean_name'].apply(self._normalize_hotel_name)
        df['city_normalized'] = df['clean_city'].apply(self._normalize_city_name)
        
        df['lat'] = pd.to_numeric(df['latitude'], errors='coerce')
        df['lng'] = pd.to_numeric(df['longitude'], errors='coerce')
        
        country_stats = df['country_iso'].value_counts()
        print(f"🌍 API countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count:,} hotels")
        
        coords_valid = df[['lat', 'lng']].notna().all(axis=1).sum()
        print(f"📍 Hotels with coordinates: {coords_valid:,}/{len(df):,} ({coords_valid/len(df)*100:.1f}%)")
        
        chain_stats = df[df['clean_chain'] != '']['clean_chain'].value_counts()
        print(f"🏨 Top 3 hotel chains:")
        for chain, count in chain_stats.head(3).items():
            print(f"   {chain}: {count:,} hotels")
        
        self.api_hotels = df
        print(f"✅ Processed {len(df):,} API hotels")
        return df
    
    def _calculate_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return self._empty_features()
        
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'country_exact': ref_hotel['country_iso'] == api_hotel['country_iso'],
            'city_similarity': self._city_similarity(ref_hotel['city'], api_hotel['city_normalized']),
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        return features
    
    def _empty_features(self) -> Dict:
        return {
            'fuzz_ratio': 0.0, 'fuzz_partial': 0.0, 'fuzz_token_sort': 0.0,
            'fuzz_token_set': 0.0, 'country_exact': False, 'city_similarity': 0.0,
            'brand_exact': False, 'brand_similarity': 0.0, 'soundex_match': False,
            'word_intersection': 0.0, 'premium_keywords': 0.0
        }
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        if not city1 or not city2:
            return 0.0
        return fuzz.ratio(city1.lower(), city2.lower()) / 100.0
    
    def _brand_exact_match(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        ref_brand = ref_hotel.get('brand')
        api_chain = api_hotel.get('clean_chain', '').lower()
        
        if not ref_brand or not api_chain:
            return False
        
        return ref_brand.lower() in api_chain or api_chain in ref_brand.lower()
    
    def _brand_similarity(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> float:
        ref_brand = ref_hotel.get('brand', '')
        api_chain = api_hotel.get('clean_chain', '')
        
        if not ref_brand or not api_chain:
            return 0.0
        
        return fuzz.ratio(ref_brand, api_chain) / 100.0
    
    def _soundex_match(self, name1: str, name2: str) -> bool:
        if not name1 or not name2:
            return False
        
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return soundex1 == soundex2
        except:
            return False
    
    def _word_intersection_ratio(self, name1: str, name2: str) -> float:
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _premium_keywords_overlap(self, name1: str, name2: str) -> float:
        premium_keywords = ['royal', 'palace', 'luxury', 'exclusive', 'premium', 'collection', 'reserve']
        
        name1_lower = name1.lower()
        name2_lower = name2.lower()
        
        name1_keywords = sum(1 for kw in premium_keywords if kw in name1_lower)
        name2_keywords = sum(1 for kw in premium_keywords if kw in name2_lower)
        
        if name1_keywords == 0 and name2_keywords == 0:
            return 0.0
        
        common_keywords = sum(1 for kw in premium_keywords if kw in name1_lower and kw in name2_lower)
        total_keywords = max(name1_keywords, name2_keywords)
        
        return common_keywords / total_keywords if total_keywords > 0 else 0.0
    
    def _high_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_exact'] and features['country_exact'] and features['city_similarity'] > 0.8:
            return {
                'match': True,
                'confidence': 0.95,
                'reason': 'Perfect brand + location'
            }
        
        if features['fuzz_token_sort'] > 0.92 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.90,
                'reason': f'Very high token sort ({features["fuzz_token_sort"]:.2f}) + country'
            }
        
        if features['fuzz_token_set'] > 0.90 and features['country_exact'] and features['city_similarity'] > 0.7:
            return {
                'match': True,
                'confidence': 0.88,
                'reason': f'High token set ({features["fuzz_token_set"]:.2f}) + location'
            }
        
        return None
    
    def _medium_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_similarity'] > 0.8 and features['fuzz_token_sort'] > 0.75 and features['country_exact']:
            confidence = 0.70 + (features['fuzz_token_sort'] - 0.75) * 0.4
            return {
                'match': True,
                'confidence': min(confidence, 0.85),
                'reason': f'Brand + good name similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_partial'] > 0.85 and features['country_exact'] and (features['brand_similarity'] > 0.6 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'High partial ({features["fuzz_partial"]:.2f}) + indicators'
            }
        
        if features['fuzz_token_set'] > 0.80 and features['country_exact'] and features['city_similarity'] > 0.6:
            confidence = 0.65 + (features['fuzz_token_set'] - 0.80) * 0.5
            return {
                'match': True,
                'confidence': min(confidence, 0.82),
                'reason': f'Token set ({features["fuzz_token_set"]:.2f}) + location'
            }
        
        return None
    
    def _lower_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['soundex_match'] and features['fuzz_partial'] > 0.75 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.60,
                'reason': f'Phonetic + partial ({features["fuzz_partial"]:.2f})'
            }
        
        if (features['word_intersection'] > 0.6 and features['premium_keywords'] > 0.4 and 
            features['country_exact'] and features['city_similarity'] > 0.5):
            return {
                'match': True,
                'confidence': 0.55,
                'reason': f'Word intersection ({features["word_intersection"]:.2f}) + premium + location'
            }
        
        return None
    
    def _make_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """
        Name-only matching decision for fallback cases
        More permissive rules when no geography available
        """
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Calculate basic features for name-only matching
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        # Name-only rules (more permissive thresholds)
        
        # Rule 1: Perfect brand match + decent name similarity
        if features['brand_exact'] and features['fuzz_token_sort'] > 0.70:
            return {
                'match': True,
                'confidence': 0.80,
                'reason': 'Perfect brand + decent name (name-only)'
            }
        
        # Rule 2: Very high name similarity
        if features['fuzz_token_sort'] > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Very high token sort ({features["fuzz_token_sort"]:.2f}) (name-only)'
            }
        
        # Rule 3: High token set similarity
        if features['fuzz_token_set'] > 0.85:
            return {
                'match': True,
                'confidence': 0.70,
                'reason': f'High token set ({features["fuzz_token_set"]:.2f}) (name-only)'
            }
        
        # Rule 4: Brand similarity + good name match
        if features['brand_similarity'] > 0.75 and features['fuzz_token_sort'] > 0.65:
            confidence = 0.60 + (features['fuzz_token_sort'] - 0.65) * 0.3
            return {
                'match': True,
                'confidence': min(confidence, 0.75),
                'reason': f'Brand similarity + name match (name-only)'
            }
        
        # Rule 5: High partial ratio + premium indicators
        if features['fuzz_partial'] > 0.80 and (features['brand_similarity'] > 0.5 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.65,
                'reason': f'High partial + indicators (name-only)'
            }
        
        return None
    def _make_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """
        Name-only matching decision for fallback cases
        More permissive rules when no geography available
        """
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Calculate basic features for name-only matching
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        # Name-only rules (more permissive thresholds)
        
        # Rule 1: Perfect brand match + decent name similarity
        if features['brand_exact'] and features['fuzz_token_sort'] > 0.70:
            return {
                'match': True,
                'confidence': 0.80,
                'reason': 'Perfect brand + decent name (name-only)'
            }
        
        # Rule 2: Very high name similarity
        if features['fuzz_token_sort'] > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Very high token sort ({features["fuzz_token_sort"]:.2f}) (name-only)'
            }
        
        # Rule 3: High token set similarity
        if features['fuzz_token_set'] > 0.85:
            return {
                'match': True,
                'confidence': 0.70,
                'reason': f'High token set ({features["fuzz_token_set"]:.2f}) (name-only)'
            }
        
        # Rule 4: Brand similarity + good name match
        if features['brand_similarity'] > 0.75 and features['fuzz_token_sort'] > 0.65:
            confidence = 0.60 + (features['fuzz_token_sort'] - 0.65) * 0.3
            return {
                'match': True,
                'confidence': min(confidence, 0.75),
                'reason': f'Brand similarity + name match (name-only)'
            }
        
        # Rule 5: High partial ratio + premium indicators
        if features['fuzz_partial'] > 0.80 and (features['brand_similarity'] > 0.5 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.65,
                'reason': f'High partial + indicators (name-only)'
            }
        
        return None
    
    def _make_matching_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        features = self._calculate_features(ref_hotel, api_hotel)
        
        result = self._high_confidence_rules(features)
        if result:
            return result
        
        result = self._medium_confidence_rules(features)
        if result:
            return result
        
        result = self._lower_confidence_rules(features)
        if result:
            return result
        
        return None
    
    def run_matching(self) -> List[Dict]:
        print("\n" + "="*60)
        print("🎯 RUNNING HOTEL MATCHING")
        print("="*60)
        
        matches = []
        api_by_country = self.api_hotels.groupby('country_iso')
        total_hotels = len(self.reference_hotels)
        
        # Algorithm performance tracking
        algorithm_stats = {
            'Perfect brand + location': 0,
            'Very high token sort + country': 0,
            'High token set + location': 0,
            'Brand + good name similarity': 0,
            'High partial + indicators': 0,
            'Token set + location': 0,
            'Phonetic + partial': 0,
            'Word intersection + premium + location': 0,
            'Name-only fallback': 0
        }
        
        print(f"📊 Processing {total_hotels} reference hotels")
        print(f"🔧 ISO pre-filtering active - grouping API by country")
        print(f"🌍 API countries available: {len(api_by_country.groups)}")
        
        skipped_no_country = 0
        skipped_no_api_country = 0
        total_comparisons = 0
        
        for idx, (_, ref_hotel) in enumerate(self.reference_hotels.iterrows(), 1):
            ref_iso = ref_hotel['country_iso']
            ref_name = ref_hotel['clean_hotel']
            
            print(f"\n[{idx:2d}/{total_hotels}] Processing: {ref_name[:50]}...")
            print(f"    🌍 Country: {ref_iso}")
            
            best_match = None
            best_confidence = 0.0
            candidates_checked = 0
            
            # Strategy 1: ISO-based matching (preferred)
            if ref_iso and ref_iso in api_by_country.groups:
                candidates = api_by_country.get_group(ref_iso)
                print(f"    📍 ISO candidates: {len(candidates):,}")
                
                for _, api_hotel in candidates.iterrows():
                    candidates_checked += 1
                    total_comparisons += 1
                    
                    result = self._make_matching_decision(ref_hotel, api_hotel)
                    
                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = {
                            'reference_id': ref_hotel['reference_id'],
                            'reference_name': ref_hotel['clean_hotel'],
                            'api_id': api_hotel['id'],
                            'api_name': api_hotel['clean_name'],
                            'api_chain': api_hotel['clean_chain'],
                            'api_city': api_hotel['clean_city'],
                            'api_address': api_hotel['clean_address'],
                            'api_latitude': api_hotel.get('lat'),
                            'api_longitude': api_hotel.get('lng'),
                            'confidence': result['confidence'],
                            'match_reason': result['reason'],
                            'match_strategy': 'ISO-based'
                        }
                    
                    if candidates_checked % 1000 == 0:
                        print(f"    ... checked {candidates_checked:,}/{len(candidates):,} candidates")
            
            # Strategy 2: Name-only fallback (when no ISO or no ISO matches)
            if not best_match or best_confidence < 0.75:
                if not ref_iso:
                    print(f"    ⚠️  No ISO mapping - trying name-only fallback")
                elif ref_iso not in api_by_country.groups:
                    print(f"    ⚠️  No API hotels for {ref_iso} - trying name-only fallback")
                else:
                    print(f"    🔄 Low confidence ({best_confidence:.3f}) - trying name-only fallback")
                
                fallback_candidates = 0
                fallback_checked = 0
                
                # Use top 5000 hotels by similarity for performance
                name_similarities = []
                for _, api_hotel in self.api_hotels.iterrows():
                    if api_hotel['normalized_name']:
                        sim = fuzz.ratio(ref_hotel['normalized_name'], api_hotel['normalized_name']) / 100.0
                        if sim > 0.4:  # Basic threshold
                            name_similarities.append((sim, api_hotel))
                
                # Sort by similarity and take top candidates
                name_similarities.sort(key=lambda x: x[0], reverse=True)
                top_candidates = name_similarities[:5000]  # Limit for performance
                fallback_candidates = len(top_candidates)
                
                print(f"    🔍 Name-only candidates: {fallback_candidates}")
                
                for sim_score, api_hotel in top_candidates:
                    fallback_checked += 1
                    total_comparisons += 1
                    
                    # Use name-only decision logic
                    result = self._make_name_only_decision(ref_hotel, api_hotel)
                    
                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = {
                            'reference_id': ref_hotel['reference_id'],
                            'reference_name': ref_hotel['clean_hotel'],
                            'api_id': api_hotel['id'],
                            'api_name': api_hotel['clean_name'],
                            'api_chain': api_hotel['clean_chain'],
                            'api_city': api_hotel['clean_city'],
                            'api_address': api_hotel['clean_address'],
                            'api_latitude': api_hotel.get('lat'),
                            'api_longitude': api_hotel.get('lng'),
                            'confidence': result['confidence'],
                            'match_reason': result['reason'],
                            'match_strategy': 'Name-only fallback'
                        }
                
                if fallback_candidates == 0:
                    if not ref_iso:
                        skipped_no_country += 1
                    else:
                        skipped_no_api_country += 1
            
            # Record result
            if best_match and best_confidence >= 0.55:
                matches.append(best_match)
                
                # Track algorithm performance
                reason = best_match['match_reason']
                algorithm_found = False
                for algo_name in algorithm_stats.keys():
                    if algo_name.lower() in reason.lower():
                        algorithm_stats[algo_name] += 1
                        algorithm_found = True
                        break
                
                if not algorithm_found and best_match['match_strategy'] == 'Name-only fallback':
                    algorithm_stats['Name-only fallback'] += 1
                
                confidence_level = "🟢 HIGH" if best_confidence >= 0.85 else "🟡 MEDIUM" if best_confidence >= 0.70 else "🔴 LOW"
                print(f"    ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"    📊 Confidence: {best_confidence:.3f} ({confidence_level})")
                print(f"    🎯 Strategy: {best_match['match_strategy']}")
                print(f"    📝 Reason: {best_match['match_reason']}")
            else:
                print(f"    ❌ No match found (best confidence: {best_confidence:.3f})")
        
        print(f"\n" + "="*60)
        print("🎯 MATCHING COMPLETED")
        print("="*60)
        print(f"✅ Total matches found: {len(matches)}")
        print(f"📊 Coverage: {len(matches)}/{total_hotels} ({len(matches)/total_hotels*100:.1f}%)")
        print(f"⚡ Performance stats:")
        print(f"   📊 Total comparisons: {total_comparisons:,}")
        print(f"   🚫 Skipped (no ISO): {skipped_no_country}")
        print(f"   🚫 Skipped (no API country): {skipped_no_api_country}")
        
        # Algorithm effectiveness breakdown
        print(f"\n🧠 ALGORITHM EFFECTIVENESS:")
        for algo_name, count in algorithm_stats.items():
            if count > 0:
                percentage = (count / len(matches)) * 100 if matches else 0
                print(f"   📈 {algo_name}: {count} matches ({percentage:.1f}%)")
        
        # Strategy breakdown
        if matches:
            iso_matches = sum(1 for m in matches if m['match_strategy'] == 'ISO-based')
            name_matches = sum(1 for m in matches if m['match_strategy'] == 'Name-only fallback')
            
            print(f"\n📊 STRATEGY BREAKDOWN:")
            print(f"   🌍 ISO-based matching: {iso_matches} ({iso_matches/len(matches)*100:.1f}%)")
            print(f"   🔍 Name-only fallback: {name_matches} ({name_matches/len(matches)*100:.1f}%)")
        
        if matches:
            confidences = [m['confidence'] for m in matches]
            high_conf = sum(1 for c in confidences if c >= 0.85)
            med_conf = sum(1 for c in confidences if 0.70 <= c < 0.85)
            low_conf = sum(1 for c in confidences if c < 0.70)
            
            print(f"\n📈 CONFIDENCE DISTRIBUTION:")
            print(f"   🟢 High (≥0.85): {high_conf}")
            print(f"   🟡 Medium (0.70-0.84): {med_conf}")
            print(f"   🔴 Low (<0.70): {low_conf}")
        
        return matches
    
    def create_full_reference_with_results(self, matches: List[Dict]) -> pd.DataFrame:
        """Return every reference hotel alongside its match details, if any.

        Starts from the ``Lokalizacja``, ``Hotel`` and ``reference_id`` columns
        of ``self.reference_hotels`` and appends one column per match field.
        Rows whose ``reference_id`` has no entry in ``matches`` get an empty
        string in every appended column.

        Args:
            matches: Match dictionaries, each carrying a ``reference_id`` key.
                When several share the same id, the last one wins.

        Returns:
            A new DataFrame with the appended match columns plus a boolean
            ``matched`` column that is True where ``api_id`` is not ``''``.
        """
        output = self.reference_hotels[['Lokalizacja', 'Hotel', 'reference_id']].copy()

        by_reference_id = {entry['reference_id']: entry for entry in matches}
        no_match = {}

        # Fields copied verbatim from the match dictionary, in output order.
        copied_fields = (
            'api_id',
            'api_name',
            'api_chain',
            'api_city',
            'api_address',
            'api_latitude',
            'api_longitude',
            'confidence',
            'match_reason',
        )
        for field in copied_fields:
            # ``field=field`` binds the current name; a plain closure would see
            # only the last loop value.
            output[field] = output['reference_id'].map(
                lambda ref_id, field=field: by_reference_id.get(ref_id, no_match).get(field, '')
            )

        output['matched'] = output['api_id'] != ''

        return output
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        """Load both data sets, run the matcher and summarise the outcome.

        Args:
            reference_csv: Path handed to ``load_reference_hotels``.
            api_csv: Path handed to ``load_api_hotels``.

        Returns:
            ``{'error': <message>}`` when either data set is still ``None``
            after its load call; otherwise a dict with the keys
            ``total_reference_hotels``, ``total_api_hotels``, ``total_matches``,
            ``coverage_percentage`` and ``matches``.
        """
        rocket_rule = "🚀" + "="*58 + "🚀"
        for banner_line in (
            rocket_rule,
            "🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀",
            rocket_rule,
            "Algorithm: ISO Pre-filtering + Rule-based Decision Tree",
            "Features: Brand extraction + Geographic + String similarity",
        ):
            print(banner_line)

        # Each loader signals failure by leaving its attribute as None.
        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}

        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}

        matches = self.run_matching()

        reference_total = len(self.reference_hotels)
        match_total = len(matches)
        return {
            'total_reference_hotels': reference_total,
            'total_api_hotels': len(self.api_hotels),
            'total_matches': match_total,
            'coverage_percentage': match_total / reference_total * 100,
            'matches': matches
        }
    
    def save_results(self, results: Dict, output_prefix: str = "hotel_mapping_iso"):
        """Write the matching results to CSV files and print a final summary.

        Up to three files are written, all named from ``output_prefix``:
        ``<prefix>_matches.csv`` (only when there is at least one match),
        ``<prefix>_full_reference.csv`` and ``<prefix>_summary.csv``.

        Args:
            results: The dictionary returned by ``run_complete_matching``.
                If it contains an ``error`` key (loading failed), the error is
                reported and nothing is written.
            output_prefix: Leading part of every output file name.

        Returns:
            None.
        """
        print("\n" + "="*60)
        print("💾 SAVING RESULTS")
        print("="*60)

        # run_complete_matching() returns {'error': ...} when a data set could
        # not be loaded; such a dict has no 'matches' key, so stop here instead
        # of failing with a KeyError below.
        if 'error' in results:
            print(f"❌ Nothing to save: {results['error']}")
            return

        matches = results['matches']

        if matches:
            matches_df = pd.DataFrame(matches)
            matches_file = f"{output_prefix}_matches.csv"
            matches_df.to_csv(matches_file, index=False)
            print(f"✅ Saved matches: {matches_file}")

        # The full reference list is written even when nothing matched.
        full_results = self.create_full_reference_with_results(matches)
        full_file = f"{output_prefix}_full_reference.csv"
        full_results.to_csv(full_file, index=False)
        print(f"✅ Saved full reference: {full_file}")

        summary = pd.DataFrame({
            'metric': ['Total Reference Hotels', 'Total API Hotels', 'Total Matches', 'Coverage Percentage'],
            'value': [results['total_reference_hotels'], results['total_api_hotels'], 
                     results['total_matches'], f"{results['coverage_percentage']:.1f}%"]
        })
        summary_file = f"{output_prefix}_summary.csv"
        summary.to_csv(summary_file, index=False)
        print(f"✅ Saved summary: {summary_file}")

        print(f"\n🎯 FINAL RESULTS:")
        print(f"   📚 Reference hotels: {results['total_reference_hotels']}")
        print(f"   📡 API hotels: {results['total_api_hotels']:,}")
        print(f"   ✅ Matches found: {results['total_matches']}")
        print(f"   📊 Coverage: {results['coverage_percentage']:.1f}%")


if __name__ == "__main__":
    # End-to-end run: match the reference list against the API export, then
    # persist whatever run_complete_matching() returned.
    matcher = ISOHotelMatcher()
    results = matcher.run_complete_matching(
        "00_api_lista_referencyjna.csv",
        "01_api_rate_hawk.csv",
    )
    matcher.save_results(results)

    # Closing banner: blank line, rule, message, rule.
    print("\n" + "="*60, "🏁 HOTEL MAPPING COMPLETED!", "="*60, sep="\n")

🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀
Algorithm: ISO Pre-filtering + Rule-based Decision Tree
Features: Brand extraction + Geographic + String similarity

📂 LOADING REFERENCE HOTELS
✅ Loaded file: 00_api_lista_referencyjna.csv
📊 Found 97 reference hotels

🧹 Processing reference data...
🌍 Mapping countries to ISO codes...
📈 Countries distribution:
   GR (Greece): 13 hotels
   MV (Maldives): 12 hotels
   AE (United Arab Emirates): 11 hotels
   ES (Spain): 8 hotels
   TH (Thailand): 7 hotels
🏨 Luxury brands detected: 23
✅ Processed 97 reference hotels

📡 LOADING API HOTELS
✅ Loaded file: 01_api_rate_hawk.csv
📊 Found 95,463 API hotels

🧹 Processing API data...
🌍 API countries distribution:
   AE (United Arab Emirates): 12,942 hotels
   GR (Greece): 8,592 hotels
   TH (Thailand): 8,063 hotels
   US (United States): 7,858 hotels
   ES (Spain): 6,906 hotels
📍 Hotels with coordinates: 95,463/95,463 (100.0%)
🏨 Top 3 hotel chains:
   No chain: 91,821 hotels
   ZenRooms: 162 hotels
   

In [14]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from typing import Dict, List, Tuple

class APIPatternAnalyzer:
    """Print exploratory statistics about a hotel API export.

    Every ``analyze_*`` / ``find_*`` method reads ``self.api_data`` (the
    DataFrame loaded by :meth:`load_api_data`) and writes a human-readable
    report to stdout; none of them returns a value.
    """

    def __init__(self):
        # Set by load_api_data(); None until then.
        self.api_data = None

    @staticmethod
    def _count_matches(texts, pattern: str, case: bool = False):
        """Return how many entries of the string Series ``texts`` match ``pattern``."""
        return texts.str.contains(pattern, case=case, na=False).sum()

    def load_api_data(self, csv_path: str):
        """Load API data for analysis from the CSV file at ``csv_path``."""
        print("🔍 Loading API data for pattern analysis...")
        self.api_data = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(self.api_data):,} hotels from API")

    def analyze_naming_patterns(self, sample_size: int = 10000):
        """Analyze hotel naming patterns in the API data.

        Reports the 20 most frequent words and how many names contain each of
        a fixed set of accommodation-type words.

        Args:
            sample_size: Maximum number of rows to inspect. Rows are drawn with
                ``DataFrame.sample`` and no fixed seed, so figures vary
                between runs whenever the data is larger than the sample.
        """
        print("\n" + "="*60)
        print("📝 HOTEL NAMING PATTERNS ANALYSIS")
        print("="*60)

        # Sample for performance
        sample_data = self.api_data.sample(min(sample_size, len(self.api_data)))
        names = sample_data['name'].dropna()

        print(f"Analyzing {len(names):,} hotel names...")

        # Common words analysis
        word_freq = Counter(
            word
            for name in names
            for word in re.findall(r'\b\w+\b', str(name).lower())
        )
        total_words = sum(word_freq.values())

        print(f"\n🔤 MOST COMMON WORDS (top 20):")
        for word, count in word_freq.most_common(20):
            percentage = (count / total_words) * 100
            print(f"   {word:<15}: {count:,} ({percentage:.1f}%)")

        # Naming patterns: report label -> case-insensitive regex
        type_regexes = {
            'hotel_word': r'\bhotel\b',
            'resort_word': r'\bresort\b',
            'spa_word': r'\bspa\b',
            'palace_word': r'\bpalace\b',
            'royal_word': r'\broyal\b',
            'grand_word': r'\bgrand\b',
            'luxury_word': r'\bluxury\b',
            'club_word': r'\bclub\b',
            'suites_word': r'\bsuites?\b',
            'inn_word': r'\binn\b',
            'lodge_word': r'\blodge\b',
            'villa_word': r'\bvilla\b',
            'apartments': r'\bapartments?\b',
            'hostel_word': r'\bhostel\b',
            'guest_house': r'\bguest.?house\b',
            'bed_breakfast': r'\bb&b\b|\bbed.?breakfast\b',
        }
        patterns = {
            label: self._count_matches(names, regex)
            for label, regex in type_regexes.items()
        }

        print(f"\n🏨 HOTEL TYPE PATTERNS:")
        total_names = len(names)
        for pattern, count in sorted(patterns.items(), key=lambda x: x[1], reverse=True):
            if count > 0:
                percentage = (count / total_names) * 100
                print(f"   {pattern.replace('_', ' ').title():<20}: {count:,} ({percentage:.1f}%)")

    def analyze_chain_patterns(self):
        """Analyze hotel chain distribution and patterns.

        Reports how many hotels carry chain information, how many distinct
        chains exist, the 15 largest chains, and how many chains fall into
        each size bucket (1, 2-5, 6-20, 21-100, more than 100 properties).
        """
        print("\n" + "="*60)
        print("🏢 HOTEL CHAIN PATTERNS ANALYSIS")
        print("="*60)

        chains = self.api_data['hotel_chain'].dropna()
        # Index: chain name, value: number of properties in that chain.
        chain_counts = chains.value_counts()

        print(f"📊 Total hotels with chain info: {len(chains):,}")
        # len(), not nunique(): nunique() on the counts would report the number
        # of distinct chain sizes rather than the number of chains.
        print(f"📊 Unique chains: {len(chain_counts):,}")
        print(f"📊 Hotels without chain: {self.api_data['hotel_chain'].isna().sum():,}")

        print(f"\n🏆 TOP 15 HOTEL CHAINS:")
        for chain, count in chain_counts.head(15).items():
            percentage = (count / len(chains)) * 100
            print(f"   {chain:<25}: {count:,} ({percentage:.1f}%)")

        # Chain size distribution: bucket the per-chain property counts
        # directly, so every figure below is a number of chains.
        print(f"\n📈 CHAIN SIZE DISTRIBUTION:")
        print(f"   Single property chains: {(chain_counts == 1).sum():,}")
        print(f"   2-5 properties: {chain_counts.between(2, 5).sum():,}")
        print(f"   6-20 properties: {chain_counts.between(6, 20).sum():,}")
        print(f"   21-100 properties: {chain_counts.between(21, 100).sum():,}")
        print(f"   100+ properties: {(chain_counts > 100).sum():,}")

    def analyze_geographic_patterns(self):
        """Analyze geographic distribution and patterns.

        Reports the number of distinct countries, the 15 countries and cities
        with the most hotels, and how many rows have both ``latitude`` and
        ``longitude`` filled in.
        """
        print("\n" + "="*60)
        print("🌍 GEOGRAPHIC PATTERNS ANALYSIS")
        print("="*60)

        # Country distribution
        countries = self.api_data['country'].dropna()
        country_counts = countries.value_counts()

        # One row of country_counts per distinct country, hence len().
        print(f"📊 Total countries: {len(country_counts)}")
        print(f"\n🌍 TOP 15 COUNTRIES BY HOTEL COUNT:")
        for country, count in country_counts.head(15).items():
            percentage = (count / len(countries)) * 100
            print(f"   {country:<5}: {count:,} ({percentage:.1f}%)")

        # City patterns
        cities = self.api_data['city'].dropna()
        city_counts = cities.value_counts()

        print(f"\n🏙️  TOP 15 CITIES BY HOTEL COUNT:")
        for city, count in city_counts.head(15).items():
            percentage = (count / len(cities)) * 100
            print(f"   {city:<20}: {count:,} ({percentage:.1f}%)")

        # Coordinate coverage: a row counts only when both values are present
        coords_available = self.api_data[['latitude', 'longitude']].notna().all(axis=1).sum()
        coords_percentage = (coords_available / len(self.api_data)) * 100
        print(f"\n📍 COORDINATE COVERAGE:")
        print(f"   Hotels with coordinates: {coords_available:,} ({coords_percentage:.1f}%)")
        print(f"   Hotels without coordinates: {len(self.api_data) - coords_available:,}")

    def analyze_address_patterns(self, sample_size: int = 5000):
        """Analyze address patterns and structure.

        Reports address length statistics, frequent address words and how many
        addresses contain street/avenue/road/boulevard words, digits, or a
        5-6 digit number.

        Args:
            sample_size: Maximum number of rows to inspect (random, unseeded
                sample, as in :meth:`analyze_naming_patterns`).
        """
        print("\n" + "="*60)
        print("📮 ADDRESS PATTERNS ANALYSIS")
        print("="*60)

        sample_data = self.api_data.sample(min(sample_size, len(self.api_data)))
        addresses = sample_data['address'].dropna()

        print(f"Analyzing {len(addresses):,} addresses...")

        # Address length distribution
        addr_lengths = addresses.str.len()
        print(f"\n📏 ADDRESS LENGTH STATS:")
        print(f"   Average length: {addr_lengths.mean():.1f} characters")
        print(f"   Median length: {addr_lengths.median():.1f} characters")
        print(f"   Min length: {addr_lengths.min()}")
        print(f"   Max length: {addr_lengths.max()}")

        # Common address words
        addr_word_freq = Counter(
            word
            for addr in addresses
            for word in re.findall(r'\b\w+\b', str(addr).lower())
        )
        total_addr_words = sum(addr_word_freq.values())

        print(f"\n🔤 MOST COMMON ADDRESS WORDS (top 15):")
        # Short words are dropped after taking the top 15, so fewer than 15
        # lines may be printed.
        for word, count in addr_word_freq.most_common(15):
            if len(word) > 2:  # Skip very short words
                percentage = (count / total_addr_words) * 100
                print(f"   {word:<15}: {count:,} ({percentage:.1f}%)")

        # Address patterns
        street_pattern = r'\bstreet\b|\bst\b|\bstr\b'
        avenue_pattern = r'\bavenue\b|\bave\b'
        road_pattern = r'\broad\b|\brd\b'
        boulevard_pattern = r'\bboulevard\b|\bblvd\b'
        number_pattern = r'\d+'
        postal_pattern = r'\b\d{5,6}\b'

        addr_patterns = {
            'contains_street': self._count_matches(addresses, street_pattern),
            'contains_avenue': self._count_matches(addresses, avenue_pattern),
            'contains_road': self._count_matches(addresses, road_pattern),
            'contains_boulevard': self._count_matches(addresses, boulevard_pattern),
            'contains_numbers': self._count_matches(addresses, number_pattern, case=True),
            'contains_postal': self._count_matches(addresses, postal_pattern, case=True),
        }

        print(f"\n🗺️  ADDRESS STRUCTURE PATTERNS:")
        total_addresses = len(addresses)
        for pattern, count in addr_patterns.items():
            percentage = (count / total_addresses) * 100
            print(f"   {pattern.replace('_', ' ').title():<20}: {count:,} ({percentage:.1f}%)")

    def analyze_data_quality(self):
        """Analyze data quality and completeness.

        Reports per-field completeness (null and empty-string counts),
        duplicate names and ids, and a few name-shape checks.
        """
        print("\n" + "="*60)
        print("✅ DATA QUALITY ANALYSIS")
        print("="*60)

        total_hotels = len(self.api_data)

        # Field completeness
        fields = ['name', 'city', 'country', 'address', 'latitude', 'longitude', 'hotel_chain']

        print(f"📊 FIELD COMPLETENESS (total: {total_hotels:,} hotels):")
        for field in fields:
            column = self.api_data[field]
            non_null = column.notna().sum()
            # Empty strings are only looked for in object (text) columns.
            empty_str = (column == '').sum() if column.dtype == 'object' else 0
            effective_data = non_null - empty_str
            effective_percentage = (effective_data / total_hotels) * 100

            print(f"   {field:<15}: {effective_data:,} ({effective_percentage:.1f}%) [null: {total_hotels-non_null:,}, empty: {empty_str:,}]")

        # Duplicates analysis
        name_duplicates = self.api_data['name'].duplicated().sum()
        id_duplicates = self.api_data['id'].duplicated().sum()

        print(f"\n🔍 DUPLICATES ANALYSIS:")
        print(f"   Duplicate names: {name_duplicates:,}")
        print(f"   Duplicate IDs: {id_duplicates:,}")

        # Name quality patterns
        names = self.api_data['name'].dropna()
        digit_pattern = r'\d'
        print(f"\n📝 NAME QUALITY PATTERNS:")
        print(f"   Very short names (≤3 chars): {(names.str.len() <= 3).sum():,}")
        print(f"   Very long names (≥100 chars): {(names.str.len() >= 100).sum():,}")
        print(f"   Names with numbers: {self._count_matches(names, digit_pattern, case=True):,}")
        print(f"   Names all caps: {names.str.isupper().sum():,}")
        print(f"   Names all lowercase: {names.str.islower().sum():,}")

    def find_premium_indicators(self, sample_size: int = 10000):
        """Find patterns that indicate premium/luxury hotels.

        Reports how many sampled names contain each premium keyword and each
        star-rating pattern.

        Args:
            sample_size: Maximum number of rows to inspect (random, unseeded
                sample, as in :meth:`analyze_naming_patterns`).
        """
        print("\n" + "="*60)
        print("💎 PREMIUM HOTEL INDICATORS ANALYSIS")
        print("="*60)

        sample_data = self.api_data.sample(min(sample_size, len(self.api_data)))
        names = sample_data['name'].dropna()

        # Premium keywords, each matched as a whole word, case-insensitively
        keyword_list = [
            'luxury', 'premium', 'deluxe', 'exclusive', 'royal', 'grand',
            'palace', 'imperial', 'signature', 'collection', 'reserve',
            'retreat', 'estate', 'private', 'boutique',
        ]
        premium_keywords = {
            keyword: self._count_matches(names, rf'\b{keyword}\b')
            for keyword in keyword_list
        }

        print(f"💎 PREMIUM KEYWORDS FREQUENCY:")
        total_names = len(names)
        for keyword, count in sorted(premium_keywords.items(), key=lambda x: x[1], reverse=True):
            if count > 0:
                percentage = (count / total_names) * 100
                print(f"   {keyword.title():<12}: {count:,} ({percentage:.1f}%)")

        # Number patterns that might indicate luxury (like "5*", "five star")
        star_patterns = {
            'five_star': self._count_matches(names, r'\b5\s*star|\bfive\s*star\b'),
            'four_star': self._count_matches(names, r'\b4\s*star|\bfour\s*star\b'),
            'stars_symbol': self._count_matches(names, r'\*+', case=True),
        }

        print(f"\n⭐ STAR RATING PATTERNS:")
        for pattern, count in star_patterns.items():
            if count > 0:
                percentage = (count / total_names) * 100
                print(f"   {pattern.replace('_', ' ').title():<15}: {count:,} ({percentage:.1f}%)")

    def run_complete_analysis(self, csv_path: str):
        """Load ``csv_path`` and run all analyses in sequence."""
        self.load_api_data(csv_path)

        self.analyze_naming_patterns()
        self.analyze_chain_patterns()
        self.analyze_geographic_patterns()
        self.analyze_address_patterns()
        self.analyze_data_quality()
        self.find_premium_indicators()

        print("\n" + "="*60)
        print("🏁 PATTERN ANALYSIS COMPLETED!")
        print("="*60)
        print("💡 Use these insights to optimize matching algorithms!")

if __name__ == "__main__":
    # Run every report against the Rate Hawk API export.
    analyzer = APIPatternAnalyzer()
    analyzer.run_complete_analysis(csv_path="01_api_rate_hawk.csv")

🔍 Loading API data for pattern analysis...
✅ Loaded 95,463 hotels from API

📝 HOTEL NAMING PATTERNS ANALYSIS
Analyzing 10,000 hotel names...

🔤 MOST COMMON WORDS (top 20):
   apartment      : 1,008 (2.1%)
   hotel          : 930 (1.9%)
   villa          : 854 (1.8%)
   in             : 843 (1.7%)
   beach          : 751 (1.6%)
   apartments     : 661 (1.4%)
   by             : 638 (1.3%)
   the            : 635 (1.3%)
   with           : 634 (1.3%)
   2              : 466 (1.0%)
   view           : 465 (1.0%)
   house          : 425 (0.9%)
   bedroom        : 380 (0.8%)
   resort         : 377 (0.8%)
   1              : 369 (0.8%)
   pool           : 340 (0.7%)
   studio         : 331 (0.7%)
   3              : 316 (0.7%)
   and            : 304 (0.6%)
   luxury         : 270 (0.6%)

🏨 HOTEL TYPE PATTERNS:
   Apartments          : 1,656 (16.6%)
   Hotel Word          : 922 (9.2%)
   Villa Word          : 831 (8.3%)
   Resort Word         : 373 (3.7%)
   Suites Word         : 324 (3.2%)

In [16]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import re
import phonetics
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class ISOHotelMatcher:
    def __init__(self):
        self.reference_hotels = None
        self.api_hotels = None
        self.iso_mappings = self._create_iso_mappings()
        self.country_to_iso = self._create_reverse_iso_lookup()
        self.luxury_brands = self._create_luxury_brands()
        
    def _create_iso_mappings(self) -> Dict[str, Dict]:
        return {
            'AE': {
                'iso_code': 'AE',
                'official_name': 'United Arab Emirates',
                'variants': ['UAE', 'U.A.E', 'United Arab Emirates', 'Emiraty Arabskie', 'Zjednoczone Emiraty Arabskie']
            },
            'TH': {
                'iso_code': 'TH',
                'official_name': 'Thailand', 
                'variants': ['Thailand', 'Tajlandia', 'Siam', 'Kingdom of Thailand']
            },
            'SC': {
                'iso_code': 'SC',
                'official_name': 'Seychelles',
                'variants': ['Seychelles', 'Seszele', 'Republic of Seychelles', 'Sesel']
            },
            'TR': {
                'iso_code': 'TR',
                'official_name': 'Turkey',
                'variants': ['Turkey', 'Turcja', 'Türkiye', 'Republic of Turkey']
            },
            'ES': {
                'iso_code': 'ES', 
                'official_name': 'Spain',
                'variants': ['Spain', 'Hiszpania', 'España', 'Kingdom of Spain']
            },
            'ME': {
                'iso_code': 'ME',
                'official_name': 'Montenegro',
                'variants': ['Montenegro', 'Czarnogóra', 'Crna Gora']
            },
            'IT': {
                'iso_code': 'IT',
                'official_name': 'Italy',
                'variants': ['Italy', 'Włochy', 'Italia', 'Italian Republic']
            },
            'MY': {
                'iso_code': 'MY',
                'official_name': 'Malaysia',
                'variants': ['Malaysia', 'Malezja']
            },
            'MV': {
                'iso_code': 'MV',
                'official_name': 'Maldives',
                'variants': ['Maldives', 'Malediwy', 'Republic of Maldives']
            },
            'GR': {
                'iso_code': 'GR',
                'official_name': 'Greece',
                'variants': ['Greece', 'Grecja', 'Hellas', 'Hellenic Republic']
            },
            'MA': {
                'iso_code': 'MA',
                'official_name': 'Morocco',
                'variants': ['Morocco', 'Maroko', 'Kingdom of Morocco']
            },
            'EG': {
                'iso_code': 'EG',
                'official_name': 'Egypt',
                'variants': ['Egypt', 'Egipt', 'Arab Republic of Egypt']
            },
            'MU': {
                'iso_code': 'MU',
                'official_name': 'Mauritius',
                'variants': ['Mauritius', 'Republic of Mauritius']
            },
            'GB': {
                'iso_code': 'GB',
                'official_name': 'United Kingdom',
                'variants': ['United Kingdom', 'Wielka Brytania', 'UK', 'Great Britain']
            },
            'PT': {
                'iso_code': 'PT',
                'official_name': 'Portugal',
                'variants': ['Portugal', 'Portugalia', 'Portuguese Republic']
            },
            'CH': {
                'iso_code': 'CH',
                'official_name': 'Switzerland',
                'variants': ['Switzerland', 'Szwajcaria', 'Swiss Confederation']
            },
            'AU': {
                'iso_code': 'AU',
                'official_name': 'Australia',
                'variants': ['Australia', 'Commonwealth of Australia']
            },
            'ID': {
                'iso_code': 'ID',
                'official_name': 'Indonesia',
                'variants': ['Indonesia', 'Indonezja', 'Republic of Indonesia']
            },
            'AW': {
                'iso_code': 'AW',
                'official_name': 'Aruba',
                'variants': ['Aruba']
            },
            'GL': {
                'iso_code': 'GL',
                'official_name': 'Greenland',
                'variants': ['Greenland', 'Grenlandia']
            },
            'VN': {
                'iso_code': 'VN',
                'official_name': 'Vietnam',
                'variants': ['Vietnam', 'Wietnam', 'Socialist Republic of Vietnam']
            },
            'FR': {
                'iso_code': 'FR',
                'official_name': 'France',
                'variants': ['France', 'Francja', 'French Republic']
            },
            'US': {
                'iso_code': 'US',
                'official_name': 'United States',
                'variants': ['United States', 'USA', 'US', 'America']
            },
            'SG': {
                'iso_code': 'SG',
                'official_name': 'Singapore',
                'variants': ['Singapore', 'Singapur', 'Republic of Singapore']
            },
            'IS': {
                'iso_code': 'IS',
                'official_name': 'Iceland',
                'variants': ['Iceland', 'Islandia', 'Republic of Iceland']
            },
            'QA': {
                'iso_code': 'QA',
                'official_name': 'Qatar',
                'variants': ['Qatar', 'State of Qatar']
            },
            'ZA': {
                'iso_code': 'ZA',
                'official_name': 'South Africa',
                'variants': ['South Africa', 'RPA', 'Republic of South Africa']
            },
            'DO': {
                'iso_code': 'DO',
                'official_name': 'Dominican Republic',
                'variants': ['Dominican Republic', 'Dominikana']
            },
            'CL': {
                'iso_code': 'CL',
                'official_name': 'Chile',
                'variants': ['Chile', 'Republic of Chile', 'Explora']
            }
        }
    
    def _create_reverse_iso_lookup(self) -> Dict[str, str]:
        lookup = {}
        for iso_code, data in self.iso_mappings.items():
            for variant in data['variants']:
                lookup[variant.lower().strip()] = iso_code
        return lookup
    
    def _create_luxury_brands(self) -> List[str]:
        return [
            'four seasons', 'atlantis', 'banyan tree', 'mandarin oriental',
            'one only', 'angsana', 'shangri-la', 'ritz carlton', 'ritz-carlton',
            'st regis', 'waldorf astoria', 'hilton', 'marriott', 'hyatt',
            'intercontinental', 'sheraton', 'westin', 'doubletree', 'regent',
            'fairmont', 'raffles', 'rosewood', 'belmond', 'six senses',
            'chedi', 'dusit thani', 'movenpick', 'mövenpick'
        ]
    
    def _map_country_to_iso(self, country_str: str) -> Optional[str]:
        if not country_str:
            return None
        
        country_clean = country_str.lower().strip()
        return self.country_to_iso.get(country_clean)
    
    def _extract_brand(self, hotel_name: str) -> Optional[str]:
        if not hotel_name:
            return None
        
        name_lower = hotel_name.lower()
        for brand in self.luxury_brands:
            if brand in name_lower:
                return brand
        return None
    
    def _normalize_hotel_name(self, name: str) -> str:
        if pd.isna(name) or not name:
            return ""
        
        name = str(name).lower().strip()
        name = re.sub(r'[^\w\s]', ' ', name)
        
        # Enhanced stop words based on API analysis
        stop_words = [
            # Basic hotel types
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 'motel',
            # Accommodation types (from API analysis)
            'apartment', 'apartments', 'villa', 'house', 'studio', 'bedroom',
            # Descriptors
            'luxury', 'grand', 'royal', 'palace', 'club', 'boutique', 'deluxe',
            # Location words
            'beach', 'view', 'pool', 'marina', 'bay', 'island',
            # Common words
            'in', 'by', 'with', 'and', 'the', 'at', 'of', 'for',
            # Numbers (converted to words later)
            '1', '2', '3', '4', '5', 'one', 'two', 'three', 'four', 'five'
        ]
        
        # Convert numbers to words for better matching
        number_map = {'1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                      '7': 'seven', '8': 'eight', '9': 'nine', '0': 'zero'}
        for digit, word in number_map.items():
            name = name.replace(f' {digit} ', f' {word} ')
        
        words = [word for word in name.split() if word not in stop_words and len(word) > 1]
        
        # Remove duplicates while preserving order
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def _normalize_city_name(self, city: str) -> str:
        if not city:
            return ""
        
        city_mappings = {
            'dubaj': 'dubai',
            'krabi': 'krabi',
            'desroches': 'desroches island'
        }
        
        city_clean = city.lower().strip()
        return city_mappings.get(city_clean, city_clean)
    
    def load_reference_hotels(self, csv_path: str) -> Optional[pd.DataFrame]:
        """Load the reference hotel list and derive the matching columns.

        The CSV must provide ``Lokalizacja`` ("country, city") and ``Hotel``
        columns.  The following columns are added: ``clean_location``,
        ``clean_hotel``, ``country_raw``, ``city_raw``, ``country_iso``,
        ``city``, ``normalized_name``, ``brand`` and ``reference_id`` (the
        row index as a zero-padded string).  Progress is printed to stdout.

        Args:
            csv_path: Path of the reference CSV file.

        Returns:
            The processed DataFrame (also stored in ``self.reference_hotels``),
            or ``None`` when the file could not be read.
        """
        print("\n" + "="*60)
        print("📂 LOADING REFERENCE HOTELS")
        print("="*60)

        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(df)} reference hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None

        print("\n🧹 Processing reference data...")
        df['clean_location'] = df['Lokalizacja'].str.strip()
        # Missing hotel names become '' (as the API loader does) so that the
        # string helpers applied below never receive NaN.
        df['clean_hotel'] = df['Hotel'].fillna('').str.strip()

        def split_location(value):
            # "country, city" -> (country, city); a value without a comma is
            # all country, and a missing value yields two empty strings.
            if not isinstance(value, str):
                return '', ''
            country, _, city = value.partition(',')
            return country.strip(), city.strip()

        # Splitting row by row always produces both parts, even when no
        # location in the file contains a comma.
        location_pairs = [split_location(value) for value in df['clean_location']]
        df['country_raw'] = [country for country, _ in location_pairs]
        df['city_raw'] = [city for _, city in location_pairs]

        print("🌍 Mapping countries to ISO codes...")
        df['country_iso'] = df['country_raw'].apply(self._map_country_to_iso)

        unmapped_countries = df[df['country_iso'].isna()]['country_raw'].unique()
        if len(unmapped_countries) > 0:
            print(f"⚠️  Unmapped countries: {list(unmapped_countries)}")

        df['city'] = df['city_raw'].apply(self._normalize_city_name)
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        df['brand'] = df['clean_hotel'].apply(self._extract_brand)
        df['reference_id'] = df.index.astype(str).str.zfill(3)

        country_stats = df['country_iso'].value_counts()
        print(f"📈 Countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count} hotels")

        brands_found = df['brand'].dropna().nunique()
        print(f"🏨 Luxury brands detected: {brands_found}")

        self.reference_hotels = df
        print(f"✅ Processed {len(df)} reference hotels")
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        """Load the API hotel catalogue and derive the matching columns.

        Reads ``name``, ``city``, ``address``, ``hotel_chain``, ``country``,
        ``latitude`` and ``longitude`` from the CSV and adds ``clean_name``,
        ``clean_city``, ``clean_address``, ``clean_chain``, ``country_iso``
        (upper-cased ``country``), ``normalized_name``, ``city_normalized``,
        ``lat`` and ``lng`` (numeric, unparseable values become NaN).
        Summary statistics are printed to stdout.

        Returns:
            The processed DataFrame (also stored in ``self.api_hotels``), or
            ``None`` when the file could not be read.
        """
        banner = "=" * 60
        print("\n" + banner)
        print("📡 LOADING API HOTELS")
        print(banner)

        try:
            api_df = pd.read_csv(csv_path)
            print(f"✅ Loaded file: {csv_path}")
            print(f"📊 Found {len(api_df):,} API hotels")
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            return None

        print("\n🧹 Processing API data...")
        # Text columns: missing values become '' and whitespace is trimmed
        text_columns = (
            ('clean_name', 'name'),
            ('clean_city', 'city'),
            ('clean_address', 'address'),
            ('clean_chain', 'hotel_chain'),
        )
        for target, source in text_columns:
            api_df[target] = api_df[source].fillna('').str.strip()

        api_df['country_iso'] = api_df['country'].fillna('').str.upper().str.strip()
        api_df['normalized_name'] = api_df['clean_name'].apply(self._normalize_hotel_name)
        api_df['city_normalized'] = api_df['clean_city'].apply(self._normalize_city_name)

        # Coordinates: anything that does not parse as a number becomes NaN
        for target, source in (('lat', 'latitude'), ('lng', 'longitude')):
            api_df[target] = pd.to_numeric(api_df[source], errors='coerce')

        top_countries = api_df['country_iso'].value_counts().head(5)
        print(f"🌍 API countries distribution:")
        for iso_code, hotel_count in top_countries.items():
            label = self.iso_mappings.get(iso_code, {}).get('official_name', iso_code)
            print(f"   {iso_code} ({label}): {hotel_count:,} hotels")

        coords_valid = api_df[['lat', 'lng']].notna().all(axis=1).sum()
        print(f"📍 Hotels with coordinates: {coords_valid:,}/{len(api_df):,} ({coords_valid/len(api_df)*100:.1f}%)")

        has_chain = api_df['clean_chain'] != ''
        top_chains = api_df.loc[has_chain, 'clean_chain'].value_counts().head(3)
        print(f"🏨 Top 3 hotel chains:")
        for chain_name, hotel_count in top_chains.items():
            print(f"   {chain_name}: {hotel_count:,} hotels")

        self.api_hotels = api_df
        print(f"✅ Processed {len(api_df):,} API hotels")
        return api_df
    
    def _calculate_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return self._empty_features()
        
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'country_exact': ref_hotel['country_iso'] == api_hotel['country_iso'],
            'city_similarity': self._city_similarity(ref_hotel['city'], api_hotel['city_normalized']),
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        return features
    
    def _empty_features(self) -> Dict:
        return {
            'fuzz_ratio': 0.0, 'fuzz_partial': 0.0, 'fuzz_token_sort': 0.0,
            'fuzz_token_set': 0.0, 'country_exact': False, 'city_similarity': 0.0,
            'brand_exact': False, 'brand_similarity': 0.0, 'soundex_match': False,
            'word_intersection': 0.0, 'premium_keywords': 0.0
        }
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        if not city1 or not city2:
            return 0.0
        return fuzz.ratio(city1.lower(), city2.lower()) / 100.0
    
    def _brand_exact_match(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        ref_brand = ref_hotel.get('brand')
        api_chain = api_hotel.get('clean_chain', '').lower()
        
        if not ref_brand or not api_chain:
            return False
        
        return ref_brand.lower() in api_chain or api_chain in ref_brand.lower()
    
    def _brand_similarity(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> float:
        ref_brand = ref_hotel.get('brand', '')
        api_chain = api_hotel.get('clean_chain', '')
        
        if not ref_brand or not api_chain:
            return 0.0
        
        return fuzz.ratio(ref_brand, api_chain) / 100.0
    
    def _soundex_match(self, name1: str, name2: str) -> bool:
        if not name1 or not name2:
            return False
        
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return soundex1 == soundex2
        except:
            return False
    
    def _word_intersection_ratio(self, name1: str, name2: str) -> float:
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _premium_keywords_overlap(self, name1: str, name2: str) -> float:
        # Enhanced premium keywords based on API analysis
        premium_keywords = [
            'luxury', 'private', 'boutique', 'grand', 'deluxe', 'royal', 
            'retreat', 'premium', 'palace', 'collection', 'signature', 
            'exclusive', 'estate', 'reserve', 'imperial',
            # Geographic premium indicators
            'beach', 'oceanfront', 'seafront', 'waterfront', 'marina', 'bay'
        ]
        
        name1_lower = name1.lower()
        name2_lower = name2.lower()
        
        name1_keywords = sum(1 for kw in premium_keywords if kw in name1_lower)
        name2_keywords = sum(1 for kw in premium_keywords if kw in name2_lower)
        
        if name1_keywords == 0 and name2_keywords == 0:
            return 0.0
        
        common_keywords = sum(1 for kw in premium_keywords if kw in name1_lower and kw in name2_lower)
        total_keywords = max(name1_keywords, name2_keywords)
        
        return common_keywords / total_keywords if total_keywords > 0 else 0.0
    
    def _high_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_exact'] and features['country_exact'] and features['city_similarity'] > 0.8:
            return {
                'match': True,
                'confidence': 0.95,
                'reason': 'Brand Perfect Match + Location Confirmed'
            }
        
        if features['fuzz_token_sort'] > 0.92 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.90,
                'reason': f'Name Almost Identical + Same Country ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_token_set'] > 0.90 and features['country_exact'] and features['city_similarity'] > 0.7:
            return {
                'match': True,
                'confidence': 0.88,
                'reason': f'Same Words Different Order + Location ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _medium_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['brand_similarity'] > 0.8 and features['fuzz_token_sort'] > 0.75 and features['country_exact']:
            confidence = 0.70 + (features['fuzz_token_sort'] - 0.75) * 0.4
            return {
                'match': True,
                'confidence': min(confidence, 0.85),
                'reason': f'Brand Match + Strong Name Similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_partial'] > 0.85 and features['country_exact'] and (features['brand_similarity'] > 0.6 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Name Contains Match + Geographic + Indicators ({features["fuzz_partial"]:.2f})'
            }
        
        if features['fuzz_token_set'] > 0.80 and features['country_exact'] and features['city_similarity'] > 0.6:
            confidence = 0.65 + (features['fuzz_token_set'] - 0.80) * 0.5
            return {
                'match': True,
                'confidence': min(confidence, 0.82),
                'reason': f'Word Shuffle Match + Location Verified ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _lower_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['soundex_match'] and features['fuzz_partial'] > 0.75 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.60,
                'reason': f'Sounds Like + Partial Match + Geography ({features["fuzz_partial"]:.2f})'
            }
        
        if (features['word_intersection'] > 0.6 and features['premium_keywords'] > 0.4 and 
            features['country_exact'] and features['city_similarity'] > 0.5):
            return {
                'match': True,
                'confidence': 0.55,
                'reason': f'Common Words + Premium Indicators + Location ({features["word_intersection"]:.2f})'
            }
        
        return None
    
    def _make_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """
        Name-only matching decision for fallback cases
        More permissive rules when no geography available
        """
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Calculate basic features for name-only matching
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        # Name-only rules (more permissive thresholds)
        
        # Rule 1: Perfect brand match + decent name similarity
        if features['brand_exact'] and features['fuzz_token_sort'] > 0.70:
            return {
                'match': True,
                'confidence': 0.80,
                'reason': 'Brand Perfect Match + Name Similarity (Pure Name Matching)'
            }
        
        # Rule 2: Very high name similarity
        if features['fuzz_token_sort'] > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Names Almost Identical (Pure Name Matching) ({features["fuzz_token_sort"]:.2f})'
            }
        
        # Rule 3: High token set similarity
        if features['fuzz_token_set'] > 0.85:
            return {
                'match': True,
                'confidence': 0.70,
                'reason': f'Same Words Different Order (Pure Name Matching) ({features["fuzz_token_set"]:.2f})'
            }
        
        # Rule 4: Brand similarity + good name match
        if features['brand_similarity'] > 0.75 and features['fuzz_token_sort'] > 0.65:
            confidence = 0.60 + (features['fuzz_token_sort'] - 0.65) * 0.3
            return {
                'match': True,
                'confidence': min(confidence, 0.75),
                'reason': f'Brand Similar + Name Match (Pure Name Matching)'
            }
        
        # Rule 5: High partial ratio + premium indicators
        if features['fuzz_partial'] > 0.80 and (features['brand_similarity'] > 0.5 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.65,
                'reason': f'Name Contains Match + Premium Signals (Pure Name Matching)'
            }
        
        return None
    def _make_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """
        Name-only matching decision for fallback cases
        More permissive rules when no geography available
        """
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Calculate basic features for name-only matching
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        # Name-only rules (more permissive thresholds)
        
        # Rule 1: Perfect brand match + decent name similarity
        if features['brand_exact'] and features['fuzz_token_sort'] > 0.70:
            return {
                'match': True,
                'confidence': 0.80,
                'reason': 'Perfect brand + decent name (name-only)'
            }
        
        # Rule 2: Very high name similarity
        if features['fuzz_token_sort'] > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Very high token sort ({features["fuzz_token_sort"]:.2f}) (name-only)'
            }
        
        # Rule 3: High token set similarity
        if features['fuzz_token_set'] > 0.85:
            return {
                'match': True,
                'confidence': 0.70,
                'reason': f'High token set ({features["fuzz_token_set"]:.2f}) (name-only)'
            }
        
        # Rule 4: Brand similarity + good name match
        if features['brand_similarity'] > 0.75 and features['fuzz_token_sort'] > 0.65:
            confidence = 0.60 + (features['fuzz_token_sort'] - 0.65) * 0.3
            return {
                'match': True,
                'confidence': min(confidence, 0.75),
                'reason': f'Brand similarity + name match (name-only)'
            }
        
        # Rule 5: High partial ratio + premium indicators
        if features['fuzz_partial'] > 0.80 and (features['brand_similarity'] > 0.5 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.65,
                'reason': f'High partial + indicators (name-only)'
            }
        
        return None
    
    def _make_matching_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        features = self._calculate_features(ref_hotel, api_hotel)
        
        result = self._high_confidence_rules(features)
        if result:
            return result
        
        result = self._medium_confidence_rules(features)
        if result:
            return result
        
        result = self._lower_confidence_rules(features)
        if result:
            return result
        
        return None
    
    def run_matching(self) -> List[Dict]:
        """Match every reference hotel against the loaded API hotels.

        For each row of ``self.reference_hotels``:

        1. If the row has an ISO code that also exists in the API data, every
           API hotel of that country is scored with
           ``_make_matching_decision`` and the highest confidence is kept.
        2. If nothing was found, or the best confidence is below 0.75, all
           API hotels whose normalised name has ``fuzz.ratio`` above 0.4 are
           ranked, and the best 5000 are scored with
           ``_make_name_only_decision``.  A fallback result replaces the
           current best only when its confidence is strictly higher.
        3. The best candidate is recorded when its confidence is at least 0.55.

        Progress and summary statistics are printed to stdout.

        Returns:
            One dict per matched reference hotel with the reference id/name,
            the API hotel's id, name, chain, city, address and coordinates,
            and the ``confidence``, ``match_reason`` and ``match_strategy``.
        """
        def build_record(ref_hotel, api_hotel, result, strategy):
            # One output row for an accepted candidate
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel['id'],
                'api_name': api_hotel['clean_name'],
                'api_chain': api_hotel['clean_chain'],
                'api_city': api_hotel['clean_city'],
                'api_address': api_hotel['clean_address'],
                'api_latitude': api_hotel.get('lat'),
                'api_longitude': api_hotel.get('lng'),
                'confidence': result['confidence'],
                'match_reason': result['reason'],
                'match_strategy': strategy
            }

        print("\n" + "="*60)
        print("🎯 RUNNING HOTEL MATCHING")
        print("="*60)

        matches = []
        api_by_country = self.api_hotels.groupby('country_iso')
        total_hotels = len(self.reference_hotels)

        # Algorithm performance tracking with human-friendly names
        algorithm_stats = {
            'Brand Perfect Match + Location Confirmed': 0,
            'Name Almost Identical + Same Country': 0,
            'Same Words Different Order + Location': 0,
            'Brand Match + Strong Name Similarity': 0,
            'Name Contains Match + Geographic + Indicators': 0,
            'Word Shuffle Match + Location Verified': 0,
            'Sounds Like + Partial Match + Geography': 0,
            'Common Words + Premium Indicators + Location': 0,
            'Brand Perfect Match + Name Similarity (Pure Name Matching)': 0,
            'Names Almost Identical (Pure Name Matching)': 0,
            'Same Words Different Order (Pure Name Matching)': 0,
            'Brand Similar + Name Match (Pure Name Matching)': 0,
            'Name Contains Match + Premium Signals (Pure Name Matching)': 0
        }

        print(f"📊 Processing {total_hotels} reference hotels")
        print(f"🔧 ISO pre-filtering active - grouping API by country")
        print(f"🌍 API countries available: {len(api_by_country.groups)}")

        skipped_no_country = 0
        skipped_no_api_country = 0
        total_comparisons = 0

        for idx, (_, ref_hotel) in enumerate(self.reference_hotels.iterrows(), 1):
            ref_iso = ref_hotel['country_iso']
            ref_name = ref_hotel['clean_hotel']

            # str() keeps the progress line printable when the name is missing (NaN)
            print(f"\n[{idx:2d}/{total_hotels}] Processing: {str(ref_name)[:50]}...")
            print(f"    🌍 Country: {ref_iso}")

            best_match = None
            best_confidence = 0.0
            candidates_checked = 0

            # Strategy 1: ISO-based matching (preferred)
            if ref_iso and ref_iso in api_by_country.groups:
                candidates = api_by_country.get_group(ref_iso)
                print(f"    📍 ISO candidates: {len(candidates):,}")

                for _, api_hotel in candidates.iterrows():
                    candidates_checked += 1
                    total_comparisons += 1

                    result = self._make_matching_decision(ref_hotel, api_hotel)

                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = build_record(ref_hotel, api_hotel, result, 'ISO-based')

                    if candidates_checked % 1000 == 0:
                        print(f"    ... checked {candidates_checked:,}/{len(candidates):,} candidates")

            # Strategy 2: Name-only fallback (when no ISO or no ISO matches)
            if not best_match or best_confidence < 0.75:
                if not ref_iso:
                    print(f"    ⚠️  No ISO mapping - trying name-only fallback")
                elif ref_iso not in api_by_country.groups:
                    print(f"    ⚠️  No API hotels for {ref_iso} - trying name-only fallback")
                else:
                    print(f"    🔄 Low confidence ({best_confidence:.3f}) - trying name-only fallback")

                # Cheap pre-filter over the whole API set: keep hotels whose
                # name is at least loosely similar, best 5000 for performance
                ref_normalized = ref_hotel['normalized_name']
                name_similarities = []
                for _, api_hotel in self.api_hotels.iterrows():
                    if api_hotel['normalized_name']:
                        sim = fuzz.ratio(ref_normalized, api_hotel['normalized_name']) / 100.0
                        if sim > 0.4:  # Basic threshold
                            name_similarities.append((sim, api_hotel))

                # Stable sort: equally similar hotels keep their API order
                name_similarities.sort(key=lambda x: x[0], reverse=True)
                top_candidates = name_similarities[:5000]  # Limit for performance
                fallback_candidates = len(top_candidates)

                print(f"    🔍 Name-only candidates: {fallback_candidates}")

                for _, api_hotel in top_candidates:
                    total_comparisons += 1

                    # Use name-only decision logic
                    result = self._make_name_only_decision(ref_hotel, api_hotel)

                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = build_record(ref_hotel, api_hotel, result, 'Name-only fallback')

                if fallback_candidates == 0:
                    if not ref_iso:
                        skipped_no_country += 1
                    else:
                        skipped_no_api_country += 1

            # Record result
            if best_match and best_confidence >= 0.55:
                matches.append(best_match)

                # Track algorithm performance: a rule is credited when its
                # label occurs in the reason (reasons may carry a score suffix)
                reason = best_match['match_reason']
                for algo_name in algorithm_stats:
                    if algo_name in reason:
                        algorithm_stats[algo_name] += 1
                        break
                else:
                    # Check for partial matches in reason
                    partial_label = None
                    if 'Pure Name Matching' in reason:
                        if 'Brand Perfect Match' in reason:
                            partial_label = 'Brand Perfect Match + Name Similarity (Pure Name Matching)'
                        elif 'Names Almost Identical' in reason:
                            partial_label = 'Names Almost Identical (Pure Name Matching)'
                        elif 'Same Words Different Order' in reason:
                            partial_label = 'Same Words Different Order (Pure Name Matching)'
                        elif 'Brand Similar' in reason:
                            partial_label = 'Brand Similar + Name Match (Pure Name Matching)'
                        elif 'Name Contains Match' in reason:
                            partial_label = 'Name Contains Match + Premium Signals (Pure Name Matching)'

                    if partial_label is None and 'Pure Name Matching' not in reason:
                        # Unrecognised reason: count it under its own text
                        # (score removed) so the report still adds up
                        partial_label = re.sub(r'\s*\(\d+\.\d+\)', '', reason)

                    if partial_label is not None:
                        algorithm_stats[partial_label] = algorithm_stats.get(partial_label, 0) + 1

                confidence_level = "🟢 HIGH" if best_confidence >= 0.85 else "🟡 MEDIUM" if best_confidence >= 0.70 else "🔴 LOW"
                print(f"    ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"    📊 Confidence: {best_confidence:.3f} ({confidence_level})")
                print(f"    🎯 Strategy: {best_match['match_strategy']}")
                print(f"    📝 Reason: {best_match['match_reason']}")
            else:
                print(f"    ❌ No match found (best confidence: {best_confidence:.3f})")

        # An empty reference list must not break the summary
        coverage_pct = len(matches) / total_hotels * 100 if total_hotels else 0.0

        print(f"\n" + "="*60)
        print("🎯 MATCHING COMPLETED")
        print("="*60)
        print(f"✅ Total matches found: {len(matches)}")
        print(f"📊 Coverage: {len(matches)}/{total_hotels} ({coverage_pct:.1f}%)")
        print(f"⚡ Performance stats:")
        print(f"   📊 Total comparisons: {total_comparisons:,}")
        print(f"   🚫 Skipped (no ISO): {skipped_no_country}")
        print(f"   🚫 Skipped (no API country): {skipped_no_api_country}")

        # Algorithm effectiveness breakdown
        print(f"\n🧠 ALGORITHM EFFECTIVENESS:")
        for algo_name, count in algorithm_stats.items():
            if count > 0:
                percentage = (count / len(matches)) * 100 if matches else 0
                print(f"   📈 {algo_name}: {count} matches ({percentage:.1f}%)")

        if matches:
            # Strategy breakdown
            iso_matches = sum(1 for m in matches if m['match_strategy'] == 'ISO-based')
            name_matches = sum(1 for m in matches if m['match_strategy'] == 'Name-only fallback')

            print(f"\n📊 STRATEGY BREAKDOWN:")
            print(f"   🌍 ISO-based matching: {iso_matches} ({iso_matches/len(matches)*100:.1f}%)")
            print(f"   🔍 Name-only fallback: {name_matches} ({name_matches/len(matches)*100:.1f}%)")

            confidences = [m['confidence'] for m in matches]
            high_conf = sum(1 for c in confidences if c >= 0.85)
            med_conf = sum(1 for c in confidences if 0.70 <= c < 0.85)
            low_conf = sum(1 for c in confidences if c < 0.70)

            print(f"\n📈 CONFIDENCE DISTRIBUTION:")
            print(f"   🟢 High (≥0.85): {high_conf}")
            print(f"   🟡 Medium (0.70-0.84): {med_conf}")
            print(f"   🔴 Low (<0.70): {low_conf}")

        return matches
    
    def create_full_reference_with_results(self, matches: List[Dict]) -> pd.DataFrame:
        """Return the full reference list with match details attached.

        Every reference hotel gets one row.  The match fields are filled from
        the entry of ``matches`` with the same ``reference_id`` and are ``''``
        for hotels without a match; ``matched`` is True where ``api_id`` is
        not ``''``.

        Args:
            matches: Match dicts keyed by ``reference_id``; if an id occurs
                more than once, the last entry is used.
        """
        base_columns = ['Lokalizacja', 'Hotel', 'reference_id', 'country_raw', 'city', 'country_iso']
        # Rename columns for clarity
        results_df = self.reference_hotels[base_columns].copy().rename(columns={
            'country_raw': 'reference_country',
            'city': 'reference_city'
        })

        by_reference_id = {}
        for match in matches:
            by_reference_id[match['reference_id']] = match

        def field_getter(field):
            # Value of `field` for a reference id, '' when the hotel is unmatched
            return lambda ref_id: by_reference_id.get(ref_id, {}).get(field, '')

        match_fields = (
            'api_id',
            'api_name',
            'api_chain',
            'api_city',
            'api_address',
            'api_latitude',
            'api_longitude',
            'confidence',
            'match_reason',
            'match_strategy',
        )
        for field in match_fields:
            results_df[field] = results_df['reference_id'].map(field_getter(field))

        results_df['matched'] = results_df['api_id'] != ''

        return results_df
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        """Load both datasets, run the matcher and summarise the outcome.

        Args:
            reference_csv: Path handed to ``load_reference_hotels``.
            api_csv: Path handed to ``load_api_hotels``.

        Returns:
            ``{'error': <message>}`` when a loader leaves its dataset unset
            (``None``). Otherwise a dict with ``total_reference_hotels``,
            ``total_api_hotels``, ``total_matches``, ``coverage_percentage``
            and the ``matches`` list returned by ``run_matching``.
        """
        print("🚀" + "="*58 + "🚀")
        print("🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀")
        print("🚀" + "="*58 + "🚀")
        print("Algorithm: ISO Pre-filtering + Rule-based Decision Tree")
        print("Features: Brand extraction + Geographic + String similarity")

        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}

        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}

        matches = self.run_matching()

        reference_count = len(self.reference_hotels)
        # An empty reference list would otherwise raise ZeroDivisionError;
        # report 0% coverage instead.
        if reference_count:
            coverage = len(matches) / reference_count * 100
        else:
            coverage = 0.0

        return {
            'total_reference_hotels': reference_count,
            'total_api_hotels': len(self.api_hotels),
            'total_matches': len(matches),
            'coverage_percentage': coverage,
            'matches': matches
        }
    
    def save_results(self, results: Dict, output_prefix: str = "hotel_mapping_iso"):
        """Write the matching results to CSV files and print a summary.

        Three files are written next to the working directory, all named
        after *output_prefix*: ``_matches.csv`` (only when there are matches),
        ``_full_reference.csv`` and ``_summary.csv``.

        Args:
            results: The dict produced by ``run_complete_matching``. When it
                is the ``{'error': ...}`` form, nothing is written.
            output_prefix: Filename prefix for the generated CSV files.
        """
        # run_complete_matching signals a failed load with {'error': ...},
        # which has no 'matches' key; report it instead of raising KeyError.
        if 'error' in results:
            print(f"❌ Nothing to save: {results['error']}")
            return

        print("\n" + "="*60)
        print("💾 SAVING RESULTS")
        print("="*60)

        matches = results['matches']

        # The matches file is skipped entirely when nothing matched.
        if matches:
            matches_df = pd.DataFrame(matches)
            matches_file = f"{output_prefix}_matches.csv"
            matches_df.to_csv(matches_file, index=False)
            print(f"✅ Saved matches: {matches_file}")

        full_results = self.create_full_reference_with_results(matches)
        full_file = f"{output_prefix}_full_reference.csv"
        full_results.to_csv(full_file, index=False)
        print(f"✅ Saved full reference: {full_file}")

        summary = pd.DataFrame({
            'metric': ['Total Reference Hotels', 'Total API Hotels', 'Total Matches', 'Coverage Percentage'],
            'value': [results['total_reference_hotels'], results['total_api_hotels'],
                      results['total_matches'], f"{results['coverage_percentage']:.1f}%"]
        })
        summary_file = f"{output_prefix}_summary.csv"
        summary.to_csv(summary_file, index=False)
        print(f"✅ Saved summary: {summary_file}")

        print(f"\n🎯 FINAL RESULTS:")
        print(f"   📚 Reference hotels: {results['total_reference_hotels']}")
        print(f"   📡 API hotels: {results['total_api_hotels']:,}")
        print(f"   ✅ Matches found: {results['total_matches']}")
        print(f"   📊 Coverage: {results['coverage_percentage']:.1f}%")


if __name__ == "__main__":
    # Script entry point: run the matcher on the two CSV inputs and save the reports.
    matcher = ISOHotelMatcher()

    results = matcher.run_complete_matching(
        reference_csv="lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"
    )

    # A failed load comes back as {'error': ...} without a 'matches' key;
    # passing that to save_results would raise KeyError, so report it here.
    if 'error' in results:
        print(f"❌ {results['error']}")
    else:
        matcher.save_results(results)

        print("\n" + "="*60)
        print("🏁 HOTEL MAPPING COMPLETED!")
        print("="*60)

🚀 HOTEL MAPPING ENGINE - ISO OPTIMIZED VERSION 🚀
Algorithm: ISO Pre-filtering + Rule-based Decision Tree
Features: Brand extraction + Geographic + String similarity

📂 LOADING REFERENCE HOTELS
✅ Loaded file: lista_referencyjna.csv
📊 Found 99 reference hotels

🧹 Processing reference data...
🌍 Mapping countries to ISO codes...
⚠️  Unmapped countries: ['The Ritz-Carlton Yacht Collection']
📈 Countries distribution:
   GR (Greece): 13 hotels
   MV (Maldives): 12 hotels
   AE (United Arab Emirates): 11 hotels
   ES (Spain): 8 hotels
   TH (Thailand): 7 hotels
🏨 Luxury brands detected: 23
✅ Processed 99 reference hotels

📡 LOADING API HOTELS
✅ Loaded file: 01_api_rate_hawk.csv
📊 Found 95,463 API hotels

🧹 Processing API data...
🌍 API countries distribution:
   AE (United Arab Emirates): 12,942 hotels
   GR (Greece): 8,592 hotels
   TH (Thailand): 8,063 hotels
   US (United States): 7,858 hotels
   ES (Spain): 6,906 hotels
📍 Hotels with coordinates: 95,463/95,463 (100.0%)
🏨 Top 3 hotel chains:

In [19]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import re
import phonetics
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class EnhancedHotelMatcher:
    def __init__(self):
        self.reference_hotels = None
        self.api_hotels = None
        self.iso_mappings = self._create_iso_mappings()
        self.country_to_iso = self._create_reverse_iso_lookup()
        self.luxury_brands = self._create_luxury_brands()
        self.api_id_brands = self._create_api_id_brands()
        
    def _create_iso_mappings(self) -> Dict[str, Dict]:
        return {
            'AE': {
                'iso_code': 'AE',
                'official_name': 'United Arab Emirates',
                'variants': ['UAE', 'U.A.E', 'United Arab Emirates', 'Emiraty Arabskie', 'Zjednoczone Emiraty Arabskie']
            },
            'TH': {
                'iso_code': 'TH',
                'official_name': 'Thailand', 
                'variants': ['Thailand', 'Tajlandia', 'Siam', 'Kingdom of Thailand']
            },
            'SC': {
                'iso_code': 'SC',
                'official_name': 'Seychelles',
                'variants': ['Seychelles', 'Seszele', 'Republic of Seychelles', 'Sesel']
            },
            'TR': {
                'iso_code': 'TR',
                'official_name': 'Turkey',
                'variants': ['Turkey', 'Turcja', 'Türkiye', 'Republic of Turkey']
            },
            'ES': {
                'iso_code': 'ES', 
                'official_name': 'Spain',
                'variants': ['Spain', 'Hiszpania', 'España', 'Kingdom of Spain']
            },
            'ME': {
                'iso_code': 'ME',
                'official_name': 'Montenegro',
                'variants': ['Montenegro', 'Czarnogóra', 'Crna Gora']
            },
            'IT': {
                'iso_code': 'IT',
                'official_name': 'Italy',
                'variants': ['Italy', 'Włochy', 'Italia', 'Italian Republic']
            },
            'MY': {
                'iso_code': 'MY',
                'official_name': 'Malaysia',
                'variants': ['Malaysia', 'Malezja']
            },
            'MV': {
                'iso_code': 'MV',
                'official_name': 'Maldives',
                'variants': ['Maldives', 'Malediwy', 'Republic of Maldives']
            },
            'GR': {
                'iso_code': 'GR',
                'official_name': 'Greece',
                'variants': ['Greece', 'Grecja', 'Hellas', 'Hellenic Republic']
            },
            'MA': {
                'iso_code': 'MA',
                'official_name': 'Morocco',
                'variants': ['Morocco', 'Maroko', 'Kingdom of Morocco']
            },
            'EG': {
                'iso_code': 'EG',
                'official_name': 'Egypt',
                'variants': ['Egypt', 'Egipt', 'Arab Republic of Egypt']
            },
            'MU': {
                'iso_code': 'MU',
                'official_name': 'Mauritius',
                'variants': ['Mauritius', 'Republic of Mauritius']
            },
            'GB': {
                'iso_code': 'GB',
                'official_name': 'United Kingdom',
                'variants': ['United Kingdom', 'Wielka Brytania', 'UK', 'Great Britain']
            },
            'PT': {
                'iso_code': 'PT',
                'official_name': 'Portugal',
                'variants': ['Portugal', 'Portugalia', 'Portuguese Republic']
            },
            'CH': {
                'iso_code': 'CH',
                'official_name': 'Switzerland',
                'variants': ['Switzerland', 'Szwajcaria', 'Swiss Confederation']
            },
            'AU': {
                'iso_code': 'AU',
                'official_name': 'Australia',
                'variants': ['Australia', 'Commonwealth of Australia']
            },
            'ID': {
                'iso_code': 'ID',
                'official_name': 'Indonesia',
                'variants': ['Indonesia', 'Indonezja', 'Republic of Indonesia']
            },
            'AW': {
                'iso_code': 'AW',
                'official_name': 'Aruba',
                'variants': ['Aruba']
            },
            'GL': {
                'iso_code': 'GL',
                'official_name': 'Greenland',
                'variants': ['Greenland', 'Grenlandia']
            },
            'VN': {
                'iso_code': 'VN',
                'official_name': 'Vietnam',
                'variants': ['Vietnam', 'Wietnam', 'Socialist Republic of Vietnam']
            },
            'FR': {
                'iso_code': 'FR',
                'official_name': 'France',
                'variants': ['France', 'Francja', 'French Republic']
            },
            'US': {
                'iso_code': 'US',
                'official_name': 'United States',
                'variants': ['United States', 'USA', 'US', 'America']
            },
            'SG': {
                'iso_code': 'SG',
                'official_name': 'Singapore',
                'variants': ['Singapore', 'Singapur', 'Republic of Singapore']
            },
            'IS': {
                'iso_code': 'IS',
                'official_name': 'Iceland',
                'variants': ['Iceland', 'Islandia', 'Republic of Iceland']
            },
            'QA': {
                'iso_code': 'QA',
                'official_name': 'Qatar',
                'variants': ['Qatar', 'State of Qatar']
            },
            'ZA': {
                'iso_code': 'ZA',
                'official_name': 'South Africa',
                'variants': ['South Africa', 'RPA', 'Republic of South Africa']
            },
            'DO': {
                'iso_code': 'DO',
                'official_name': 'Dominican Republic',
                'variants': ['Dominican Republic', 'Dominikana']
            },
            'CL': {
                'iso_code': 'CL',
                'official_name': 'Chile',
                'variants': ['Chile', 'Republic of Chile', 'Explora']
            }
        }
    
    def _create_reverse_iso_lookup(self) -> Dict[str, str]:
        lookup = {}
        for iso_code, data in self.iso_mappings.items():
            for variant in data['variants']:
                lookup[variant.lower().strip()] = iso_code
        return lookup
    
    def _create_luxury_brands(self) -> List[str]:
        return [
            'four seasons', 'atlantis', 'banyan tree', 'mandarin oriental',
            'one only', 'angsana', 'shangri-la', 'ritz carlton', 'ritz-carlton',
            'st regis', 'waldorf astoria', 'hilton', 'marriott', 'hyatt',
            'intercontinental', 'sheraton', 'westin', 'doubletree', 'regent',
            'fairmont', 'raffles', 'rosewood', 'belmond', 'six senses',
            'chedi', 'dusit thani', 'movenpick', 'mövenpick'
        ]
    
    def _create_api_id_brands(self) -> Dict[str, str]:
        """Brand patterns found in API IDs"""
        return {
            'four_seasons': 'Four Seasons',
            'oneonly': 'One & Only',
            'one_only': 'One & Only',
            'mandarin_oriental': 'Mandarin Oriental',
            'waldorf_astoria': 'Waldorf Astoria',
            'ritz_carlton': 'Ritz Carlton',
            'six_senses': 'Six Senses',
            'banyan_tree': 'Banyan Tree',
            'shangri': 'Shangri-La',
            'atlantis': 'Atlantis',
            'raffles': 'Raffles',
            'joali': 'JOALI',
            'angsana': 'Angsana',
            'hilton': 'Hilton',
            'marriott': 'Marriott',
            'hyatt': 'Hyatt'
        }
    
    def _map_country_to_iso(self, country_str: str) -> Optional[str]:
        if not country_str:
            return None
        country_clean = country_str.lower().strip()
        return self.country_to_iso.get(country_clean)
    
    def _extract_brand(self, hotel_name: str) -> Optional[str]:
        if not hotel_name:
            return None
        name_lower = hotel_name.lower()
        for brand in self.luxury_brands:
            if brand in name_lower:
                return brand
        return None
    
    def _extract_brand_from_api_id(self, api_id: str) -> Optional[str]:
        """Extract brand from API ID - more reliable than name parsing"""
        if not api_id:
            return None
        api_lower = api_id.lower()
        for pattern, brand in self.api_id_brands.items():
            if pattern in api_lower:
                return brand
        return None
    
    def _normalize_hotel_name(self, name: str) -> str:
        if pd.isna(name) or not name:
            return ""
        
        name = str(name).lower().strip()
        name = re.sub(r'[^\w\s]', ' ', name)
        
        # Enhanced stop words based on API analysis
        stop_words = [
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 'motel',
            'apartment', 'apartments', 'villa', 'house', 'studio', 'bedroom',
            'luxury', 'grand', 'royal', 'palace', 'club', 'boutique', 'deluxe',
            'beach', 'view', 'pool', 'marina', 'bay', 'island',
            'in', 'by', 'with', 'and', 'the', 'at', 'of', 'for',
            '1', '2', '3', '4', '5', 'one', 'two', 'three', 'four', 'five'
        ]
        
        # Convert numbers to words for better matching
        number_map = {'1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five'}
        for digit, word in number_map.items():
            name = name.replace(f' {digit} ', f' {word} ')
        
        words = [word for word in name.split() if word not in stop_words and len(word) > 1]
        
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def _normalize_city_name(self, city: str) -> str:
        if not city:
            return ""
        city_mappings = {
            'dubaj': 'dubai',
            'krabi': 'krabi',
            'desroches': 'desroches island'
        }
        city_clean = city.lower().strip()
        return city_mappings.get(city_clean, city_clean)
    
    def load_reference_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📂 LOADING REFERENCE HOTELS")
        print("="*60)
        
        df = pd.read_csv(csv_path)
        print(f"✅ Loaded file: {csv_path}")
        print(f"📊 Found {len(df)} reference hotels")
        
        print("\n🧹 Processing reference data...")
        df['clean_location'] = df['Lokalizacja'].str.strip()
        df['clean_hotel'] = df['Hotel'].str.strip()
        
        df[['country_raw', 'city_raw']] = df['clean_location'].str.split(',', n=1, expand=True)
        df['country_raw'] = df['country_raw'].fillna('').str.strip()
        df['city_raw'] = df['city_raw'].fillna('').str.strip()
        
        print("🌍 Mapping countries to ISO codes...")
        df['country_iso'] = df['country_raw'].apply(self._map_country_to_iso)
        
        unmapped_countries = df[df['country_iso'].isna()]['country_raw'].unique()
        if len(unmapped_countries) > 0:
            print(f"⚠️  Unmapped countries: {list(unmapped_countries)}")
        
        df['city'] = df['city_raw'].apply(self._normalize_city_name)
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        df['brand'] = df['clean_hotel'].apply(self._extract_brand)
        df['reference_id'] = df.index.astype(str).str.zfill(3)
        
        country_stats = df['country_iso'].value_counts()
        print(f"📈 Countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count} hotels")
        
        brands_found = df['brand'].dropna().nunique()
        print(f"🏨 Luxury brands detected: {brands_found}")
        
        self.reference_hotels = df
        print(f"✅ Processed {len(df)} reference hotels")
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        print("\n" + "="*60)
        print("📡 LOADING API HOTELS")
        print("="*60)
        
        df = pd.read_csv(csv_path)
        print(f"✅ Loaded file: {csv_path}")
        print(f"📊 Found {len(df):,} API hotels")
        
        print("\n🧹 Processing API data...")
        df['clean_name'] = df['name'].fillna('').str.strip()
        df['clean_city'] = df['city'].fillna('').str.strip()
        df['clean_address'] = df['address'].fillna('').str.strip()
        df['clean_chain'] = df['hotel_chain'].fillna('').str.strip()
        
        df['country_iso'] = df['country'].fillna('').str.upper().str.strip()
        df['normalized_name'] = df['clean_name'].apply(self._normalize_hotel_name)
        df['city_normalized'] = df['clean_city'].apply(self._normalize_city_name)
        
        df['lat'] = pd.to_numeric(df['latitude'], errors='coerce')
        df['lng'] = pd.to_numeric(df['longitude'], errors='coerce')
        
        country_stats = df['country_iso'].value_counts()
        print(f"🌍 API countries distribution:")
        for country, count in country_stats.head(5).items():
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count:,} hotels")
        
        coords_valid = df[['lat', 'lng']].notna().all(axis=1).sum()
        print(f"📍 Hotels with coordinates: {coords_valid:,}/{len(df):,} ({coords_valid/len(df)*100:.1f}%)")
        
        chain_stats = df[df['clean_chain'] != '']['clean_chain'].value_counts()
        print(f"🏨 Top 3 hotel chains:")
        for chain, count in chain_stats.head(3).items():
            print(f"   {chain}: {count:,} hotels")
        
        self.api_hotels = df
        print(f"✅ Processed {len(df):,} API hotels")
        return df
    
    def _calculate_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return self._empty_features()
        
        features = {
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            'country_exact': ref_hotel['country_iso'] == api_hotel['country_iso'],
            'city_similarity': self._city_similarity(ref_hotel['city'], api_hotel['city_normalized']),
            'brand_exact': self._brand_exact_match(ref_hotel, api_hotel),
            'brand_similarity': self._brand_similarity(ref_hotel, api_hotel),
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        # API ID enhancement
        api_brand = self._extract_brand_from_api_id(api_hotel.get('id', ''))
        ref_brand = ref_hotel.get('brand')
        features['api_id_brand_match'] = api_brand and ref_brand and self._brands_match(ref_brand, api_brand)
        
        return features
    
    def _empty_features(self) -> Dict:
        return {
            'fuzz_ratio': 0.0, 'fuzz_partial': 0.0, 'fuzz_token_sort': 0.0,
            'fuzz_token_set': 0.0, 'country_exact': False, 'city_similarity': 0.0,
            'brand_exact': False, 'brand_similarity': 0.0, 'soundex_match': False,
            'word_intersection': 0.0, 'premium_keywords': 0.0, 'api_id_brand_match': False
        }
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        if not city1 or not city2:
            return 0.0
        return fuzz.ratio(city1.lower(), city2.lower()) / 100.0
    
    def _brand_exact_match(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        ref_brand = ref_hotel.get('brand')
        api_chain = api_hotel.get('clean_chain', '').lower()
        
        if not ref_brand or not api_chain:
            return False
        
        return ref_brand.lower() in api_chain or api_chain in ref_brand.lower()
    
    def _brand_similarity(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> float:
        ref_brand = ref_hotel.get('brand', '')
        api_chain = api_hotel.get('clean_chain', '')
        
        if not ref_brand or not api_chain:
            return 0.0
        
        return fuzz.ratio(ref_brand, api_chain) / 100.0
    
    def _brands_match(self, ref_brand: str, api_brand: str) -> bool:
        """Conservative brand matching for API ID"""
        if not ref_brand or not api_brand:
            return False
        similarity = fuzz.ratio(ref_brand.lower(), api_brand.lower()) / 100.0
        return similarity > 0.85
    
    def _soundex_match(self, name1: str, name2: str) -> bool:
        if not name1 or not name2:
            return False
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return soundex1 == soundex2
        except:
            return False
    
    def _word_intersection_ratio(self, name1: str, name2: str) -> float:
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _premium_keywords_overlap(self, name1: str, name2: str) -> float:
        premium_keywords = [
            'luxury', 'private', 'boutique', 'grand', 'deluxe', 'royal', 
            'retreat', 'premium', 'palace', 'collection', 'signature', 
            'exclusive', 'estate', 'reserve', 'imperial',
            'beach', 'oceanfront', 'seafront', 'waterfront', 'marina', 'bay'
        ]
        
        name1_lower = name1.lower()
        name2_lower = name2.lower()
        
        name1_keywords = sum(1 for kw in premium_keywords if kw in name1_lower)
        name2_keywords = sum(1 for kw in premium_keywords if kw in name2_lower)
        
        if name1_keywords == 0 and name2_keywords == 0:
            return 0.0
        
        common_keywords = sum(1 for kw in premium_keywords if kw in name1_lower and kw in name2_lower)
        total_keywords = max(name1_keywords, name2_keywords)
        
        return common_keywords / total_keywords if total_keywords > 0 else 0.0
    
    def _high_confidence_rules(self, features: Dict) -> Optional[Dict]:
        # API ID Brand Match gets highest priority
        if features['api_id_brand_match'] and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.97,
                'reason': 'API ID Brand Perfect Match + Same Country'
            }
        
        if features['brand_exact'] and features['country_exact'] and features['city_similarity'] > 0.8:
            return {
                'match': True,
                'confidence': 0.95,
                'reason': 'Brand Perfect Match + Location Confirmed'
            }
        
        if features['fuzz_token_sort'] > 0.92 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.90,
                'reason': f'Name Almost Identical + Same Country ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_token_set'] > 0.90 and features['country_exact'] and features['city_similarity'] > 0.7:
            return {
                'match': True,
                'confidence': 0.88,
                'reason': f'Same Words Different Order + Location ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _medium_confidence_rules(self, features: Dict) -> Optional[Dict]:
        # API ID Brand Match with lower name similarity
        if features['api_id_brand_match'] and features['country_exact'] and features['fuzz_token_sort'] > 0.60:
            confidence = 0.80 + features['fuzz_token_sort'] * 0.1
            return {
                'match': True,
                'confidence': min(confidence, 0.95),
                'reason': f'API ID Brand Match + Name Similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['brand_similarity'] > 0.8 and features['fuzz_token_sort'] > 0.75 and features['country_exact']:
            confidence = 0.70 + (features['fuzz_token_sort'] - 0.75) * 0.4
            return {
                'match': True,
                'confidence': min(confidence, 0.85),
                'reason': f'Brand Match + Strong Name Similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        if features['fuzz_partial'] > 0.85 and features['country_exact'] and (features['brand_similarity'] > 0.6 or features['premium_keywords'] > 0.3):
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Name Contains Match + Geographic + Indicators ({features["fuzz_partial"]:.2f})'
            }
        
        if features['fuzz_token_set'] > 0.80 and features['country_exact'] and features['city_similarity'] > 0.6:
            confidence = 0.65 + (features['fuzz_token_set'] - 0.80) * 0.5
            return {
                'match': True,
                'confidence': min(confidence, 0.82),
                'reason': f'Word Shuffle Match + Location Verified ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _lower_confidence_rules(self, features: Dict) -> Optional[Dict]:
        if features['soundex_match'] and features['fuzz_partial'] > 0.75 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.60,
                'reason': f'Sounds Like + Partial Match + Geography ({features["fuzz_partial"]:.2f})'
            }
        
        if (features['word_intersection'] > 0.6 and features['premium_keywords'] > 0.4 and 
            features['country_exact'] and features['city_similarity'] > 0.5):
            return {
                'match': True,
                'confidence': 0.55,
                'reason': f'Common Words + Premium Indicators + Location ({features["word_intersection"]:.2f})'
            }
        
        return None
    
    def _make_matching_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        features = self._calculate_features(ref_hotel, api_hotel)
        
        result = self._high_confidence_rules(features)
        if result:
            return result
        
        result = self._medium_confidence_rules(features)
        if result:
            return result
        
        result = self._lower_confidence_rules(features)
        if result:
            return result
        
        return None
    
    def _make_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Check for API ID brand match first
        api_brand = self._extract_brand_from_api_id(api_hotel.get('id', ''))
        ref_brand = ref_hotel.get('brand')
        api_id_brand_match = api_brand and ref_brand and self._brands_match(ref_brand, api_brand)
        
        if api_id_brand_match:
            name_sim = fuzz.token_sort_ratio(ref_name, api_name) / 100.0
            confidence = 0.75 + name_sim * 0.15
            return {
                'match': True,
                'confidence': min(confidence, 0.90),
                'reason': f'API ID Brand Match + Name Similarity (Pure Name Matching) ({name_sim:.2f})'
            }
        
        # Regular name-only matching
        fuzz_token_sort = fuzz.token_sort_ratio(ref_name, api_name) / 100.0
        fuzz_token_set = fuzz.token_set_ratio(ref_name, api_name) / 100.0
        
        if fuzz_token_sort > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Names Almost Identical (Pure Name Matching) ({fuzz_token_sort:.2f})'
            }
        
        if fuzz_token_set > 0.85:
            return {
                'match': True,
                'confidence': 0.70,
                'reason': f'Same Words Different Order (Pure Name Matching) ({fuzz_token_set:.2f})'
            }
        
        return None
    
    def run_matching(self) -> List[Dict]:
        print("\n" + "="*60)
        print("🎯 RUNNING ENHANCED HOTEL MATCHING")
        print("="*60)
        
        matches = []
        api_by_country = self.api_hotels.groupby('country_iso')
        total_hotels = len(self.reference_hotels)
        
        algorithm_stats = {
            'API ID Brand Perfect Match + Same Country': 0,
            'Brand Perfect Match + Location Confirmed': 0,
            'Name Almost Identical + Same Country': 0,
            'Same Words Different Order + Location': 0,
            'API ID Brand Match + Name Similarity': 0,
            'Brand Match + Strong Name Similarity': 0,
            'Name Contains Match + Geographic + Indicators': 0,
            'Word Shuffle Match + Location Verified': 0,
            'Sounds Like + Partial Match + Geography': 0,
            'Common Words + Premium Indicators + Location': 0,
            'API ID Brand Match + Name Similarity (Pure Name Matching)': 0,
            'Names Almost Identical (Pure Name Matching)': 0,
            'Same Words Different Order (Pure Name Matching)': 0
        }
        
        print(f"📊 Processing {total_hotels} reference hotels")
        print(f"🔧 ISO pre-filtering + API ID intelligence active")
        print(f"🌍 API countries available: {len(api_by_country.groups)}")
        
        total_comparisons = 0
        skipped_no_country = 0
        skipped_no_api_country = 0
        api_id_rescues = 0
        
        for idx, (_, ref_hotel) in enumerate(self.reference_hotels.iterrows(), 1):
            ref_iso = ref_hotel['country_iso']
            ref_name = ref_hotel['clean_hotel']
            
            print(f"\n[{idx:2d}/{total_hotels}] Processing: {ref_name[:50]}...")
            print(f"    🌍 Country: {ref_iso}")
            
            best_match = None
            best_confidence = 0.0
            
            # Strategy 1: ISO-based matching
            if ref_iso and ref_iso in api_by_country.groups:
                candidates = api_by_country.get_group(ref_iso)
                print(f"    📍 ISO candidates: {len(candidates):,}")
                
                candidates_checked = 0
                for _, api_hotel in candidates.iterrows():
                    candidates_checked += 1
                    total_comparisons += 1
                    
                    result = self._make_matching_decision(ref_hotel, api_hotel)
                    
                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = {
                            'reference_id': ref_hotel['reference_id'],
                            'reference_name': ref_hotel['clean_hotel'],
                            'api_id': api_hotel['id'],
                            'api_name': api_hotel['clean_name'],
                            'api_chain': api_hotel['clean_chain'],
                            'api_city': api_hotel['clean_city'],
                            'api_address': api_hotel['clean_address'],
                            'api_latitude': api_hotel.get('lat'),
                            'api_longitude': api_hotel.get('lng'),
                            'confidence': result['confidence'],
                            'match_reason': result['reason'],
                            'match_strategy': 'ISO-based'
                        }
                    
                    if candidates_checked % 1000 == 0:
                        print(f"    ... checked {candidates_checked:,}/{len(candidates):,} candidates")
            
            # Strategy 2: Name-only fallback
            if not best_match or best_confidence < 0.75:
                if not ref_iso:
                    print(f"    ⚠️  No ISO mapping - trying name-only fallback")
                elif ref_iso not in api_by_country.groups:
                    print(f"    ⚠️  No API hotels for {ref_iso} - trying name-only fallback")
                else:
                    print(f"    🔄 Low confidence ({best_confidence:.3f}) - trying name-only fallback")
                
                # Use top 5000 hotels by similarity
                name_similarities = []
                for _, api_hotel in self.api_hotels.iterrows():
                    if api_hotel['normalized_name']:
                        sim = fuzz.ratio(ref_hotel['normalized_name'], api_hotel['normalized_name']) / 100.0
                        if sim > 0.4:
                            name_similarities.append((sim, api_hotel))
                
                name_similarities.sort(key=lambda x: x[0], reverse=True)
                top_candidates = name_similarities[:5000]
                
                print(f"    🔍 Name-only candidates: {len(top_candidates)}")
                
                for sim_score, api_hotel in top_candidates:
                    total_comparisons += 1
                    
                    result = self._make_name_only_decision(ref_hotel, api_hotel)
                    
                    if result and result['confidence'] > best_confidence:
                        best_confidence = result['confidence']
                        best_match = {
                            'reference_id': ref_hotel['reference_id'],
                            'reference_name': ref_hotel['clean_hotel'],
                            'api_id': api_hotel['id'],
                            'api_name': api_hotel['clean_name'],
                            'api_chain': api_hotel['clean_chain'],
                            'api_city': api_hotel['clean_city'],
                            'api_address': api_hotel['clean_address'],
                            'api_latitude': api_hotel.get('lat'),
                            'api_longitude': api_hotel.get('lng'),
                            'confidence': result['confidence'],
                            'match_reason': result['reason'],
                            'match_strategy': 'Name-only fallback'
                        }
                
                if len(top_candidates) == 0:
                    if not ref_iso:
                        skipped_no_country += 1
                    else:
                        skipped_no_api_country += 1
            
            # Strategy 3: API ID Rescue (only for strong brands)
            if not best_match:
                ref_brand = self._extract_brand(ref_hotel['clean_hotel'])
                if ref_brand and ref_iso and ref_iso in api_by_country.groups:
                    print(f"    🚑 Trying API ID rescue for brand: {ref_brand}")
                    rescue_candidates = api_by_country.get_group(ref_iso)
                    
                    for _, api_hotel in rescue_candidates.iterrows():
                        api_brand = self._extract_brand_from_api_id(api_hotel.get('id', ''))
                        
                        if api_brand and self._brands_match(ref_brand, api_brand):
                            # Calculate rescue confidence
                            name_sim = fuzz.token_sort_ratio(ref_hotel['normalized_name'], 
                                                            api_hotel['normalized_name']) / 100.0
                            rescue_confidence = 0.70 + name_sim * 0.15
                            
                            if rescue_confidence >= 0.60:
                                best_match = {
                                    'reference_id': ref_hotel['reference_id'],
                                    'reference_name': ref_hotel['clean_hotel'],
                                    'api_id': api_hotel['id'],
                                    'api_name': api_hotel['clean_name'],
                                    'api_chain': api_hotel['clean_chain'],
                                    'api_city': api_hotel['clean_city'],
                                    'api_address': api_hotel['clean_address'],
                                    'api_latitude': api_hotel.get('lat'),
                                    'api_longitude': api_hotel.get('lng'),
                                    'confidence': rescue_confidence,
                                    'match_reason': f'API ID Brand Rescue: {api_brand} match',
                                    'match_strategy': 'API ID Rescue'
                                }
                                api_id_rescues += 1
                                print(f"    🚑 RESCUE SUCCESS: {api_hotel['clean_name'][:40]}...")
                                break
            
            # Record result
            if best_match and best_confidence >= 0.55:
                matches.append(best_match)
                
                # Track algorithm performance
                reason = best_match['match_reason']
                for algo_name in algorithm_stats.keys():
                    if algo_name in reason:
                        algorithm_stats[algo_name] += 1
                        break
                
                confidence_level = "🟢 HIGH" if best_confidence >= 0.85 else "🟡 MEDIUM" if best_confidence >= 0.70 else "🔴 LOW"
                print(f"    ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"    📊 Confidence: {best_confidence:.3f} ({confidence_level})")
                print(f"    🎯 Strategy: {best_match['match_strategy']}")
                print(f"    📝 Reason: {best_match['match_reason'][:60]}...")
            else:
                print(f"    ❌ No match found (best confidence: {best_confidence:.3f})")
        
        print(f"\n" + "="*60)
        print("🎯 ENHANCED MATCHING COMPLETED")
        print("="*60)
        print(f"✅ Total matches found: {len(matches)}")
        print(f"📊 Coverage: {len(matches)}/{total_hotels} ({len(matches)/total_hotels*100:.1f}%)")
        print(f"⚡ Performance stats:")
        print(f"   📊 Total comparisons: {total_comparisons:,}")
        print(f"   🚫 Skipped (no ISO): {skipped_no_country}")
        print(f"   🚫 Skipped (no API country): {skipped_no_api_country}")
        print(f"   🚑 API ID rescues: {api_id_rescues}")
        
        print(f"\n🧠 ALGORITHM EFFECTIVENESS:")
        for algo_name, count in algorithm_stats.items():
            if count > 0:
                percentage = (count / len(matches)) * 100 if matches else 0
                print(f"   📈 {algo_name}: {count} matches ({percentage:.1f}%)")
        
        # Strategy breakdown
        if matches:
            iso_matches = sum(1 for m in matches if m['match_strategy'] == 'ISO-based')
            name_matches = sum(1 for m in matches if m['match_strategy'] == 'Name-only fallback')
            rescue_matches = sum(1 for m in matches if m['match_strategy'] == 'API ID Rescue')
            
            print(f"\n📊 STRATEGY BREAKDOWN:")
            print(f"   🌍 ISO-based matching: {iso_matches} ({iso_matches/len(matches)*100:.1f}%)")
            print(f"   🔍 Name-only fallback: {name_matches} ({name_matches/len(matches)*100:.1f}%)")
            if rescue_matches > 0:
                print(f"   🚑 API ID Rescue: {rescue_matches} ({rescue_matches/len(matches)*100:.1f}%)")
        
        if matches:
            confidences = [m['confidence'] for m in matches]
            high_conf = sum(1 for c in confidences if c >= 0.85)
            med_conf = sum(1 for c in confidences if 0.70 <= c < 0.85)
            low_conf = sum(1 for c in confidences if c < 0.70)
            
            print(f"\n📈 CONFIDENCE DISTRIBUTION:")
            print(f"   🟢 High (≥0.85): {high_conf}")
            print(f"   🟡 Medium (0.70-0.84): {med_conf}")
            print(f"   🔴 Low (<0.70): {low_conf}")
        
        return matches
    
    def create_full_reference_with_results(self, matches: List[Dict]) -> pd.DataFrame:
        """Build a per-reference-hotel report with match details attached.

        Every row of ``self.reference_hotels`` is kept. A row whose
        ``reference_id`` appears in ``matches`` receives that match's fields;
        for every other row those fields are empty strings.

        Args:
            matches: Match dicts, each carrying a ``reference_id`` key. If
                several dicts share a ``reference_id`` the last one wins.

        Returns:
            A new DataFrame holding the selected reference columns
            (``country_raw`` / ``city`` renamed to ``reference_country`` /
            ``reference_city``), one column per match field, and a boolean
            ``matched`` column that is True where ``api_id`` is not ``''``.
        """
        reference_columns = [
            'Lokalizacja', 'Hotel', 'reference_id',
            'country_raw', 'city', 'country_iso',
        ]
        # Output columns copied from the match dicts, in output order.
        match_fields = (
            'api_id', 'api_name', 'api_chain', 'api_city', 'api_address',
            'api_latitude', 'api_longitude', 'confidence',
            'match_reason', 'match_strategy',
        )

        # Index the matches by reference id so each row is a single lookup.
        by_reference_id = {}
        for entry in matches:
            by_reference_id[entry['reference_id']] = entry

        report = self.reference_hotels[reference_columns].copy()
        report = report.rename(columns={
            'country_raw': 'reference_country',
            'city': 'reference_city',
        })

        unmatched = {}
        reference_ids = report['reference_id']
        for field in match_fields:
            # The lambda is consumed immediately by map(), so closing over
            # the loop variable is safe here.
            report[field] = reference_ids.map(
                lambda ref_id: by_reference_id.get(ref_id, unmatched).get(field, '')
            )

        report['matched'] = report['api_id'] != ''
        return report
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        """Load both hotel tables, run the matcher and summarise the outcome.

        Args:
            reference_csv: Path handed to ``load_reference_hotels``.
            api_csv: Path handed to ``load_api_hotels``.

        Returns:
            ``{'error': <message>}`` when either table is still ``None``
            after its load call. Otherwise a dict with the keys
            ``total_reference_hotels``, ``total_api_hotels``,
            ``total_matches``, ``coverage_percentage`` and ``matches``.
            ``coverage_percentage`` is ``0.0`` when the reference table has
            no rows, instead of raising ``ZeroDivisionError``.
        """
        print("🚀" + "="*58 + "🚀")
        print("🚀 ENHANCED HOTEL MAPPING ENGINE - API ID INTELLIGENCE 🚀")
        print("🚀" + "="*58 + "🚀")
        print("Algorithm: ISO Pre-filtering + Rule-based Decision Tree + API ID Intelligence")
        print("Features: Brand extraction + Geographic + String similarity + API ID rescue")

        # Each loader reports failure by leaving its attribute as None.
        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}

        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}

        matches = self.run_matching()

        total_reference = len(self.reference_hotels)
        # An empty reference table has no meaningful coverage; report 0.0
        # rather than dividing by zero.
        coverage = len(matches) / total_reference * 100 if total_reference else 0.0

        return {
            'total_reference_hotels': total_reference,
            'total_api_hotels': len(self.api_hotels),
            'total_matches': len(matches),
            'coverage_percentage': coverage,
            'matches': matches
        }
    
    def save_results(self, results: Dict, output_prefix: str = "enhanced_hotel_mapping"):
        """Write the matching results to CSV files and print a summary.

        Three files may be written, all UTF-8 with BOM and named from
        ``output_prefix``: ``*_matches.csv`` (only when there is at least one
        match), ``*_full_reference.csv`` and ``*_summary.csv``.

        Args:
            results: Either a results dict with the keys ``matches``,
                ``total_reference_hotels``, ``total_api_hotels``,
                ``total_matches`` and ``coverage_percentage``, or an
                ``{'error': <message>}`` dict.
            output_prefix: Prefix (may include a directory) for the output
                file names.

        Returns:
            None. When ``results`` carries an ``error`` key the message is
            printed and nothing is written, instead of failing with
            ``KeyError`` on the missing ``matches`` key.
        """
        if 'error' in results:
            print(f"❌ Nothing to save: {results['error']}")
            return

        print("\n" + "="*60)
        print("💾 SAVING ENHANCED RESULTS")
        print("="*60)

        matches = results['matches']

        # The matches file is skipped entirely when nothing matched.
        if matches:
            matches_df = pd.DataFrame(matches)
            matches_file = f"{output_prefix}_matches.csv"
            matches_df.to_csv(matches_file, index=False, encoding='utf-8-sig')
            print(f"✅ Saved matches: {matches_file}")

        full_results = self.create_full_reference_with_results(matches)
        full_file = f"{output_prefix}_full_reference.csv"
        full_results.to_csv(full_file, index=False, encoding='utf-8-sig')
        print(f"✅ Saved full reference: {full_file}")

        summary = pd.DataFrame({
            'metric': ['Total Reference Hotels', 'Total API Hotels', 'Total Matches', 'Coverage Percentage'],
            'value': [results['total_reference_hotels'], results['total_api_hotels'],
                     results['total_matches'], f"{results['coverage_percentage']:.1f}%"]
        })
        summary_file = f"{output_prefix}_summary.csv"
        summary.to_csv(summary_file, index=False, encoding='utf-8-sig')
        print(f"✅ Saved summary: {summary_file}")

        print(f"\n🎯 ENHANCED FINAL RESULTS:")
        print(f"   📚 Reference hotels: {results['total_reference_hotels']}")
        print(f"   📡 API hotels: {results['total_api_hotels']:,}")
        print(f"   ✅ Matches found: {results['total_matches']}")
        print(f"   📊 Coverage: {results['coverage_percentage']:.1f}%")


if __name__ == "__main__":
    # Script entry point: build a matcher, run the whole pipeline on the two
    # CSV files named below, then write the result files.
    # NOTE(review): EnhancedHotelMatcher is not defined in the visible part of
    # this file — presumably it is the class the methods above belong to; confirm.
    matcher = EnhancedHotelMatcher()

    # File names are relative, so they resolve against the current working
    # directory.
    results = matcher.run_complete_matching(
        reference_csv="00_api_lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"
    )

    # `results` is passed on unchanged, including the {'error': ...} dict
    # that run_complete_matching returns when a table could not be loaded.
    matcher.save_results(results)

    print("\n" + "="*60)
    print("🏁 ENHANCED HOTEL MAPPING COMPLETED!")
    print("="*60)

🚀 ENHANCED HOTEL MAPPING ENGINE - API ID INTELLIGENCE 🚀
Algorithm: ISO Pre-filtering + Rule-based Decision Tree + API ID Intelligence
Features: Brand extraction + Geographic + String similarity + API ID rescue

📂 LOADING REFERENCE HOTELS
✅ Loaded file: 00_api_lista_referencyjna.csv
📊 Found 97 reference hotels

🧹 Processing reference data...
🌍 Mapping countries to ISO codes...
📈 Countries distribution:
   GR (Greece): 13 hotels
   MV (Maldives): 12 hotels
   AE (United Arab Emirates): 11 hotels
   ES (Spain): 8 hotels
   TH (Thailand): 7 hotels
🏨 Luxury brands detected: 23
✅ Processed 97 reference hotels

📡 LOADING API HOTELS
✅ Loaded file: 01_api_rate_hawk.csv
📊 Found 95,463 API hotels

🧹 Processing API data...
🌍 API countries distribution:
   AE (United Arab Emirates): 12,942 hotels
   GR (Greece): 8,592 hotels
   TH (Thailand): 8,063 hotels
   US (United States): 7,858 hotels
   ES (Spain): 6,906 hotels
📍 Hotels with coordinates: 95,463/95,463 (100.0%)
🏨 Top 3 hotel chains:
   No cha

In [22]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import re
import phonetics
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class ProductionHotelMatcher:
    def __init__(self, api_source: str = 'universal'):
        self.reference_hotels = None
        self.api_hotels = None
        self.api_source = api_source
        
        # Universal components - work with ANY API
        self.iso_mappings = self._create_iso_mappings()
        self.country_to_iso = self._create_reverse_iso_lookup()
        self.universal_brands = self._create_universal_brands()
        self.stop_words = self._create_enhanced_stop_words()
        self.premium_keywords = self._create_premium_keywords()
        
        # API-specific enhancements (optional)
        self.api_enhancements = self._load_api_enhancements(api_source)
        
    def _create_iso_mappings(self) -> Dict[str, Dict]:
        """Universal ISO country mappings - works with all APIs"""
        return {
            'AE': {
                'iso_code': 'AE',
                'official_name': 'United Arab Emirates',
                'variants': ['UAE', 'U.A.E', 'United Arab Emirates', 'Emiraty Arabskie', 'Zjednoczone Emiraty Arabskie']
            },
            'TH': {
                'iso_code': 'TH',
                'official_name': 'Thailand', 
                'variants': ['Thailand', 'Tajlandia', 'Siam', 'Kingdom of Thailand']
            },
            'SC': {
                'iso_code': 'SC',
                'official_name': 'Seychelles',
                'variants': ['Seychelles', 'Seszele', 'Republic of Seychelles', 'Sesel']
            },
            'TR': {
                'iso_code': 'TR',
                'official_name': 'Turkey',
                'variants': ['Turkey', 'Turcja', 'Türkiye', 'Republic of Turkey']
            },
            'ES': {
                'iso_code': 'ES', 
                'official_name': 'Spain',
                'variants': ['Spain', 'Hiszpania', 'España', 'Kingdom of Spain']
            },
            'ME': {
                'iso_code': 'ME',
                'official_name': 'Montenegro',
                'variants': ['Montenegro', 'Czarnogóra', 'Crna Gora']
            },
            'IT': {
                'iso_code': 'IT',
                'official_name': 'Italy',
                'variants': ['Italy', 'Włochy', 'Italia', 'Italian Republic']
            },
            'MY': {
                'iso_code': 'MY',
                'official_name': 'Malaysia',
                'variants': ['Malaysia', 'Malezja']
            },
            'MV': {
                'iso_code': 'MV',
                'official_name': 'Maldives',
                'variants': ['Maldives', 'Malediwy', 'Republic of Maldives']
            },
            'GR': {
                'iso_code': 'GR',
                'official_name': 'Greece',
                'variants': ['Greece', 'Grecja', 'Hellas', 'Hellenic Republic']
            },
            'MA': {
                'iso_code': 'MA',
                'official_name': 'Morocco',
                'variants': ['Morocco', 'Maroko', 'Kingdom of Morocco']
            },
            'EG': {
                'iso_code': 'EG',
                'official_name': 'Egypt',
                'variants': ['Egypt', 'Egipt', 'Arab Republic of Egypt']
            },
            'MU': {
                'iso_code': 'MU',
                'official_name': 'Mauritius',
                'variants': ['Mauritius', 'Republic of Mauritius']
            },
            'GB': {
                'iso_code': 'GB',
                'official_name': 'United Kingdom',
                'variants': ['United Kingdom', 'Wielka Brytania', 'UK', 'Great Britain']
            },
            'PT': {
                'iso_code': 'PT',
                'official_name': 'Portugal',
                'variants': ['Portugal', 'Portugalia', 'Portuguese Republic']
            },
            'CH': {
                'iso_code': 'CH',
                'official_name': 'Switzerland',
                'variants': ['Switzerland', 'Szwajcaria', 'Swiss Confederation']
            },
            'AU': {
                'iso_code': 'AU',
                'official_name': 'Australia',
                'variants': ['Australia', 'Commonwealth of Australia']
            },
            'ID': {
                'iso_code': 'ID',
                'official_name': 'Indonesia',
                'variants': ['Indonesia', 'Indonezja', 'Republic of Indonesia']
            },
            'AW': {
                'iso_code': 'AW',
                'official_name': 'Aruba',
                'variants': ['Aruba']
            },
            'GL': {
                'iso_code': 'GL',
                'official_name': 'Greenland',
                'variants': ['Greenland', 'Grenlandia']
            },
            'VN': {
                'iso_code': 'VN',
                'official_name': 'Vietnam',
                'variants': ['Vietnam', 'Wietnam', 'Socialist Republic of Vietnam']
            },
            'FR': {
                'iso_code': 'FR',
                'official_name': 'France',
                'variants': ['France', 'Francja', 'French Republic']
            },
            'US': {
                'iso_code': 'US',
                'official_name': 'United States',
                'variants': ['United States', 'USA', 'US', 'America']
            },
            'SG': {
                'iso_code': 'SG',
                'official_name': 'Singapore',
                'variants': ['Singapore', 'Singapur', 'Republic of Singapore']
            },
            'IS': {
                'iso_code': 'IS',
                'official_name': 'Iceland',
                'variants': ['Iceland', 'Islandia', 'Republic of Iceland']
            },
            'QA': {
                'iso_code': 'QA',
                'official_name': 'Qatar',
                'variants': ['Qatar', 'State of Qatar']
            },
            'ZA': {
                'iso_code': 'ZA',
                'official_name': 'South Africa',
                'variants': ['South Africa', 'RPA', 'Republic of South Africa']
            },
            'DO': {
                'iso_code': 'DO',
                'official_name': 'Dominican Republic',
                'variants': ['Dominican Republic', 'Dominikana']
            },
            'CL': {
                'iso_code': 'CL',
                'official_name': 'Chile',
                'variants': ['Chile', 'Republic of Chile', 'Explora']
            }
        }
    
    def _create_reverse_iso_lookup(self) -> Dict[str, str]:
        lookup = {}
        for iso_code, data in self.iso_mappings.items():
            for variant in data['variants']:
                lookup[variant.lower().strip()] = iso_code
        return lookup
    
    def _create_universal_brands(self) -> List[str]:
        """Universal luxury brands - work with all APIs"""
        return [
            'four seasons', 'atlantis', 'banyan tree', 'mandarin oriental',
            'one only', 'one & only', 'angsana', 'shangri-la', 'ritz carlton', 
            'ritz-carlton', 'st regis', 'waldorf astoria', 'hilton', 'marriott', 
            'hyatt', 'intercontinental', 'sheraton', 'westin', 'doubletree', 
            'regent', 'fairmont', 'raffles', 'rosewood', 'belmond', 'six senses',
            'chedi', 'dusit thani', 'movenpick', 'mövenpick', 'joali', 'conrad',
            'park hyatt', 'grand hyatt', 'andaz', 'aloft', 'w hotels', 'le meridien',
            'luxury collection', 'autograph collection', 'jw marriott'
        ]
    
    def _create_enhanced_stop_words(self) -> List[str]:
        """Enhanced stop words based on API analysis"""
        return [
            # Basic hotel types
            'hotel', 'resort', 'spa', 'suites', 'inn', 'lodge', 'motel',
            # Accommodation types
            'apartment', 'apartments', 'villa', 'house', 'studio', 'bedroom',
            # Descriptors
            'luxury', 'grand', 'royal', 'palace', 'club', 'boutique', 'deluxe',
            # Location words
            'beach', 'view', 'pool', 'marina', 'bay', 'island',
            # Common words
            'in', 'by', 'with', 'and', 'the', 'at', 'of', 'for',
            # Numbers
            '1', '2', '3', '4', '5', 'one', 'two', 'three', 'four', 'five'
        ]
    
    def _create_premium_keywords(self) -> List[str]:
        """Premium indicators that work across all APIs"""
        return [
            'luxury', 'private', 'boutique', 'grand', 'deluxe', 'royal', 
            'retreat', 'premium', 'palace', 'collection', 'signature', 
            'exclusive', 'estate', 'reserve', 'imperial',
            'beach', 'oceanfront', 'seafront', 'waterfront', 'marina', 'bay'
        ]
    
    def _extract_id_components_smart(self, api_id: str, ref_hotel: pd.Series) -> Dict:
        """
        Smart dynamic analysis of API ID against reference hotel data
        No hardcoded patterns needed - compares ID parts with reference data
        """
        if not api_id or len(str(api_id)) < 3:
            return {'confidence_boost': 0.0, 'matches': []}
        
        # Split ID into meaningful components
        id_parts = re.split(r'[_\-\s\.]+', str(api_id).lower())
        id_parts = [part.strip() for part in id_parts if len(part) > 2 and not part.isdigit()]
        
        if not id_parts:
            return {'confidence_boost': 0.0, 'matches': []}
        
        # Reference data for comparison
        ref_brand = ref_hotel.get('brand', '').lower() if ref_hotel.get('brand') else ''
        ref_city = ref_hotel.get('city', '').lower() if ref_hotel.get('city') else ''
        ref_country = ref_hotel.get('country_raw', '').lower() if ref_hotel.get('country_raw') else ''
        ref_name_words = ref_hotel.get('normalized_name', '').split() if ref_hotel.get('normalized_name') else []
        
        matches = []
        confidence_boost = 0.0
        
        # City variant mappings for international names
        city_variants = {
            'warsaw': ['warszawa', 'warsaw'],
            'prague': ['praga', 'praha', 'prague'], 
            'vienna': ['wiedeń', 'wien', 'vienna'],
            'munich': ['monachium', 'münchen', 'munich'],
            'florence': ['florencja', 'firenze', 'florence'],
            'rome': ['rzym', 'roma', 'rome'],
            'paris': ['paryż', 'paris'],
            'london': ['londyn', 'london'],
            'istanbul': ['stambul', 'istanbul'],
            'athens': ['ateny', 'athens'],
            'budapest': ['budapeszt', 'budapest'],
            'madrid': ['madryt', 'madrid'],
            'lisbon': ['lizbona', 'lisboa', 'lisbon'],
            'zurich': ['zurych', 'zurich'],
            'geneva': ['genewa', 'genève', 'geneva'],
            'dubai': ['dubaj', 'dubai'],
            'moscow': ['moskwa', 'moscow'],
            'beijing': ['pekin', 'beijing']
        }
        
        # Check each ID component against reference data
        for part in id_parts:
            if len(part) < 3:
                continue
                
            # Brand matching
            if ref_brand and len(ref_brand) > 2:
                brand_similarity = fuzz.ratio(part, ref_brand) / 100.0
                if brand_similarity > 0.85:
                    confidence_boost += 0.06
                    matches.append(f"Brand: '{part}' matches '{ref_brand}'")
                    continue
            
            # City matching (direct)
            if ref_city and len(ref_city) > 2:
                city_similarity = fuzz.ratio(part, ref_city) / 100.0
                if city_similarity > 0.80:
                    confidence_boost += 0.04
                    matches.append(f"City: '{part}' matches '{ref_city}'")
                    continue
            
            # City variant matching
            if ref_city:
                for canonical, variants in city_variants.items():
                    if part in variants and ref_city in variants:
                        confidence_boost += 0.04
                        matches.append(f"City variant: '{part}' ~ '{ref_city}'")
                        break
            
            # Hotel name word matching
            for ref_word in ref_name_words:
                if len(ref_word) > 3:
                    word_similarity = fuzz.ratio(part, ref_word) / 100.0
                    if word_similarity > 0.85:
                        confidence_boost += 0.02
                        matches.append(f"Name part: '{part}' matches '{ref_word}'")
                        break
            
            # Country matching (less common but useful)
            if ref_country and len(ref_country) > 3:
                country_similarity = fuzz.ratio(part, ref_country) / 100.0
                if country_similarity > 0.85:
                    confidence_boost += 0.03
                    matches.append(f"Country: '{part}' matches '{ref_country}'")
        
        # Cap the boost to prevent over-enhancement
        confidence_boost = min(confidence_boost, 0.15)
        
        return {
            'confidence_boost': confidence_boost,
            'matches': matches,
            'id_parts': id_parts
        }
    
    def _try_smart_api_enhancements(self, base_result: Dict, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        """
        Smart enhancement using dynamic ID analysis - no hardcoded patterns
        """
        if not base_result:
            return base_result
        
        api_id = api_hotel.get('id', '')
        if not api_id:
            return base_result
        
        # Perform smart analysis
        analysis = self._extract_id_components_smart(api_id, ref_hotel)
        
        if analysis['confidence_boost'] > 0.01:  # Only enhance if meaningful boost
            enhanced_result = base_result.copy()
            original_confidence = enhanced_result['confidence']
            
            enhanced_result['confidence'] = min(
                original_confidence + analysis['confidence_boost'], 
                0.98
            )
            
            # Add enhancement details to reason
            if analysis['matches']:
                enhancement_details = '; '.join(analysis['matches'][:2])  # Show top 2 matches
                enhanced_result['reason'] += f" + Smart ID Analysis: {enhancement_details}"
            
            return enhanced_result
        
        return base_result
    
    def _map_country_to_iso(self, country_str: str) -> Optional[str]:
        if not country_str:
            return None
        country_clean = country_str.lower().strip()
        return self.country_to_iso.get(country_clean)
    
    def _extract_brand_from_name(self, hotel_name: str) -> Optional[str]:
        """Universal brand extraction from hotel names"""
        if not hotel_name:
            return None
        name_lower = hotel_name.lower()
        for brand in self.universal_brands:
            if brand in name_lower:
                return brand
        return None
    
    def _normalize_hotel_name(self, name: str) -> str:
        """Universal name normalization"""
        if pd.isna(name) or not name:
            return ""
        
        name = str(name).lower().strip()
        name = re.sub(r'[^\w\s]', ' ', name)
        
        # Convert numbers to words
        number_map = {'1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five'}
        for digit, word in number_map.items():
            name = name.replace(f' {digit} ', f' {word} ')
        
        words = [word for word in name.split() if word not in self.stop_words and len(word) > 1]
        
        # Remove duplicates while preserving order
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen:
                unique_words.append(word)
                seen.add(word)
        
        return ' '.join(unique_words)
    
    def _load_api_enhancements(self, api_source: str) -> Dict:
        """Load API-specific enhancements if available"""
        # Simplified - focus on chain field detection
        enhancements = {
            'rate_hawk': {
                'has_chain_field': True,
                'smart_id_analysis': True,
                'confidence_boost': 0.03
            },
            'booking_com': {
                'has_chain_field': True,
                'smart_id_analysis': False,
                'confidence_boost': 0.03
            },
            'expedia': {
                'has_chain_field': True,
                'smart_id_analysis': False,
                'confidence_boost': 0.02
            },
            'universal': {
                'has_chain_field': True,
                'smart_id_analysis': True,
                'confidence_boost': 0.0
            }
        }
        return enhancements.get(api_source, enhancements['universal'])
    
    def _normalize_city_name(self, city: str) -> str:
        if not city:
            return ""
        city_mappings = {
            'dubaj': 'dubai',
            'krabi': 'krabi',
            'desroches': 'desroches island'
        }
        city_clean = city.lower().strip()
        return city_mappings.get(city_clean, city_clean)
    
    def load_reference_hotels(self, csv_path: str) -> pd.DataFrame:
        """Load the reference hotel list from CSV and derive the matching columns.

        The CSV must provide the columns ``Lokalizacja`` (location, written as
        ``"<country>, <city>"`` or just ``"<country>"``) and ``Hotel``. The
        following columns are added: ``clean_location``, ``clean_hotel``,
        ``country_raw``, ``city_raw``, ``country_iso``, ``city``,
        ``normalized_name``, ``brand`` and ``reference_id`` (the row index as a
        zero-padded string of at least three digits). Progress and summary
        statistics are printed.

        Args:
            csv_path: Path of the reference CSV file.

        Returns:
            The processed DataFrame, which is also stored in
            ``self.reference_hotels``.
        """
        print("\n" + "="*60)
        print("📂 LOADING REFERENCE HOTELS")
        print("="*60)

        df = pd.read_csv(csv_path)
        print(f"✅ Loaded file: {csv_path}")
        print(f"📊 Found {len(df)} reference hotels")

        print("\n🧹 Processing reference data...")
        df['clean_location'] = df['Lokalizacja'].str.strip()
        df['clean_hotel'] = df['Hotel'].str.strip()

        # Split on the first comma only. str.partition always returns
        # (head, separator, tail), so a file in which no location contains a
        # comma still gets an (empty) city column instead of failing on a
        # one-column split result.
        location_parts = df['clean_location'].fillna('').map(lambda location: location.partition(','))
        df['country_raw'] = location_parts.map(lambda parts: parts[0].strip())
        df['city_raw'] = location_parts.map(lambda parts: parts[2].strip())

        print("🌍 Mapping countries to ISO codes...")
        df['country_iso'] = df['country_raw'].apply(self._map_country_to_iso)

        unmapped_countries = df[df['country_iso'].isna()]['country_raw'].unique()
        if len(unmapped_countries) > 0:
            print(f"⚠️  Unmapped countries: {list(unmapped_countries)}")

        df['city'] = df['city_raw'].apply(self._normalize_city_name)
        df['normalized_name'] = df['clean_hotel'].apply(self._normalize_hotel_name)
        df['brand'] = df['clean_hotel'].apply(self._extract_brand_from_name)
        df['reference_id'] = df.index.astype(str).str.zfill(3)

        country_stats = df['country_iso'].value_counts()
        print(f"📈 Countries distribution:")
        for country, count in country_stats.head(5).items():
            # Fall back to the ISO code itself when no official name is known.
            country_name = self.iso_mappings.get(country, {}).get('official_name', country)
            print(f"   {country} ({country_name}): {count} hotels")

        brands_found = df['brand'].dropna().nunique()
        print(f"🏨 Universal brands detected: {brands_found}")

        self.reference_hotels = df
        print(f"✅ Processed {len(df)} reference hotels")
        return df
    
    def load_api_hotels(self, csv_path: str) -> pd.DataFrame:
        """Load the API hotel dump from CSV and derive the matching columns.

        Reads the columns ``name``, ``city``, ``address``, ``country``,
        ``latitude`` and ``longitude``. The chain is taken from the first of
        ``hotel_chain``, ``chain``, ``brand``, ``group`` that exists in the
        file (empty string when none does). Adds ``clean_name``,
        ``clean_city``, ``clean_address``, ``clean_chain``, ``country_iso``,
        ``normalized_name``, ``city_normalized``, ``brand_from_name``, ``lat``
        and ``lng``; coordinates that cannot be parsed become NaN.

        Returns:
            The processed DataFrame, which is also stored in ``self.api_hotels``.
        """
        rule = "=" * 60
        print("\n" + rule)
        print("📡 LOADING API HOTELS")
        print(rule)

        api_df = pd.read_csv(csv_path)
        print(f"✅ Loaded file: {csv_path}")
        print(f"📊 Found {len(api_df):,} API hotels")

        print("\n🧹 Processing API data...")
        for target, source in (('clean_name', 'name'), ('clean_city', 'city'), ('clean_address', 'address')):
            api_df[target] = api_df[source].fillna('').str.strip()

        # The chain may be published under several column names; the first one present wins.
        chain_column = next(
            (name for name in ('hotel_chain', 'chain', 'brand', 'group') if name in api_df.columns),
            None,
        )
        if chain_column is None:
            api_df['clean_chain'] = ''
        else:
            api_df['clean_chain'] = api_df[chain_column].fillna('').str.strip()

        api_df['country_iso'] = api_df['country'].fillna('').str.upper().str.strip()
        api_df['normalized_name'] = api_df['clean_name'].apply(self._normalize_hotel_name)
        api_df['city_normalized'] = api_df['clean_city'].apply(self._normalize_city_name)
        api_df['brand_from_name'] = api_df['clean_name'].apply(self._extract_brand_from_name)

        for target, source in (('lat', 'latitude'), ('lng', 'longitude')):
            api_df[target] = pd.to_numeric(api_df[source], errors='coerce')

        print(f"🌍 API countries distribution:")
        top_countries = api_df['country_iso'].value_counts().head(5)
        for iso_code, hotel_count in top_countries.items():
            display_name = self.iso_mappings.get(iso_code, {}).get('official_name', iso_code)
            print(f"   {iso_code} ({display_name}): {hotel_count:,} hotels")

        has_both_coords = api_df[['lat', 'lng']].notna().all(axis=1)
        coords_valid = has_both_coords.sum()
        print(f"📍 Hotels with coordinates: {coords_valid:,}/{len(api_df):,} ({coords_valid/len(api_df)*100:.1f}%)")

        print(f"🔧 Smart ID analysis: {self.api_enhancements.get('smart_id_analysis', False)}")
        print(f"🎯 Enhancement features: {list(self.api_enhancements.keys())}")

        self.api_hotels = api_df
        print(f"✅ Processed {len(api_df):,} API hotels")
        return api_df
    
    def _calculate_universal_features(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        """Calculate features that work with any API"""
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return self._empty_features()
        
        features = {
            # String similarity (universal)
            'fuzz_ratio': fuzz.ratio(ref_name, api_name) / 100.0,
            'fuzz_partial': fuzz.partial_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_sort': fuzz.token_sort_ratio(ref_name, api_name) / 100.0,
            'fuzz_token_set': fuzz.token_set_ratio(ref_name, api_name) / 100.0,
            
            # Geographic (universal)
            'country_exact': ref_hotel['country_iso'] == api_hotel['country_iso'],
            'city_similarity': self._city_similarity(ref_hotel['city'], api_hotel['city_normalized']),
            
            # Brand detection (universal)
            'brand_from_name': self._brand_match_from_names(ref_hotel, api_hotel),
            'brand_chain_match': self._brand_chain_match(ref_hotel, api_hotel),
            
            # Additional universal features
            'soundex_match': self._soundex_match(ref_name, api_name),
            'word_intersection': self._word_intersection_ratio(ref_name, api_name),
            'premium_keywords': self._premium_keywords_overlap(ref_hotel['clean_hotel'], api_hotel['clean_name'])
        }
        
        return features
    
    def _empty_features(self) -> Dict:
        return {
            'fuzz_ratio': 0.0, 'fuzz_partial': 0.0, 'fuzz_token_sort': 0.0,
            'fuzz_token_set': 0.0, 'country_exact': False, 'city_similarity': 0.0,
            'brand_from_name': False, 'brand_chain_match': False, 'soundex_match': False,
            'word_intersection': 0.0, 'premium_keywords': 0.0
        }
    
    def _city_similarity(self, city1: str, city2: str) -> float:
        if not city1 or not city2:
            return 0.0
        return fuzz.ratio(city1.lower(), city2.lower()) / 100.0
    
    def _brand_match_from_names(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        """Brand matching from hotel names (universal)"""
        ref_brand = ref_hotel.get('brand')
        api_brand = api_hotel.get('brand_from_name')
        
        if not ref_brand or not api_brand:
            return False
        
        return fuzz.ratio(ref_brand, api_brand) / 100.0 > 0.85
    
    def _brand_chain_match(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> bool:
        """Brand matching using chain field (if available)"""
        ref_brand = ref_hotel.get('brand')
        api_chain = api_hotel.get('clean_chain', '').lower()
        
        if not ref_brand or not api_chain:
            return False
        
        return ref_brand.lower() in api_chain or fuzz.ratio(ref_brand, api_chain) / 100.0 > 0.80
    
    def _soundex_match(self, name1: str, name2: str) -> bool:
        if not name1 or not name2:
            return False
        try:
            soundex1 = phonetics.soundex(name1.replace(' ', ''))
            soundex2 = phonetics.soundex(name2.replace(' ', ''))
            return soundex1 == soundex2
        except:
            return False
    
    def _word_intersection_ratio(self, name1: str, name2: str) -> float:
        if not name1 or not name2:
            return 0.0
        
        words1 = set(name1.lower().split())
        words2 = set(name2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _premium_keywords_overlap(self, name1: str, name2: str) -> float:
        name1_lower = name1.lower()
        name2_lower = name2.lower()
        
        name1_keywords = sum(1 for kw in self.premium_keywords if kw in name1_lower)
        name2_keywords = sum(1 for kw in self.premium_keywords if kw in name2_lower)
        
        if name1_keywords == 0 and name2_keywords == 0:
            return 0.0
        
        common_keywords = sum(1 for kw in self.premium_keywords if kw in name1_lower and kw in name2_lower)
        total_keywords = max(name1_keywords, name2_keywords)
        
        return common_keywords / total_keywords if total_keywords > 0 else 0.0
    
    def _universal_high_confidence_rules(self, features: Dict) -> Optional[Dict]:
        """High confidence rules using only universal features"""
        
        # Brand match + country + high name similarity
        if features['brand_chain_match'] and features['country_exact'] and features['fuzz_token_sort'] > 0.85:
            return {
                'match': True,
                'confidence': 0.95,
                'reason': 'Universal Brand + Location + High Name Similarity'
            }
        
        if features['brand_from_name'] and features['country_exact'] and features['city_similarity'] > 0.8:
            return {
                'match': True,
                'confidence': 0.93,
                'reason': 'Universal Brand Match + Perfect Location'
            }
        
        # Very high name similarity + country
        if features['fuzz_token_sort'] > 0.92 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.90,
                'reason': f'Universal Name Almost Identical + Same Country ({features["fuzz_token_sort"]:.2f})'
            }
        
        # Token set high + location
        if features['fuzz_token_set'] > 0.90 and features['country_exact'] and features['city_similarity'] > 0.7:
            return {
                'match': True,
                'confidence': 0.88,
                'reason': f'Universal Same Words Different Order + Location ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _universal_medium_confidence_rules(self, features: Dict) -> Optional[Dict]:
        """Medium confidence rules using universal features"""
        
        # Brand similarity + good name match
        if (features['brand_from_name'] or features['brand_chain_match']) and features['fuzz_token_sort'] > 0.75 and features['country_exact']:
            confidence = 0.75 + (features['fuzz_token_sort'] - 0.75) * 0.4
            return {
                'match': True,
                'confidence': min(confidence, 0.87),
                'reason': f'Universal Brand + Strong Name Similarity ({features["fuzz_token_sort"]:.2f})'
            }
        
        # High partial ratio + context
        if features['fuzz_partial'] > 0.85 and features['country_exact'] and features['premium_keywords'] > 0.3:
            return {
                'match': True,
                'confidence': 0.77,
                'reason': f'Universal Name Contains + Geographic + Premium ({features["fuzz_partial"]:.2f})'
            }
        
        # Token set + location
        if features['fuzz_token_set'] > 0.80 and features['country_exact'] and features['city_similarity'] > 0.6:
            confidence = 0.65 + (features['fuzz_token_set'] - 0.80) * 0.5
            return {
                'match': True,
                'confidence': min(confidence, 0.83),
                'reason': f'Universal Word Shuffle + Location ({features["fuzz_token_set"]:.2f})'
            }
        
        return None
    
    def _universal_lower_confidence_rules(self, features: Dict) -> Optional[Dict]:
        """Lower confidence rules using universal features"""
        
        # Phonetic + partial match
        if features['soundex_match'] and features['fuzz_partial'] > 0.75 and features['country_exact']:
            return {
                'match': True,
                'confidence': 0.62,
                'reason': f'Universal Phonetic + Partial Match ({features["fuzz_partial"]:.2f})'
            }
        
        # Word intersection + premium + location
        if (features['word_intersection'] > 0.6 and features['premium_keywords'] > 0.4 and 
            features['country_exact'] and features['city_similarity'] > 0.5):
            return {
                'match': True,
                'confidence': 0.58,
                'reason': f'Universal Word Intersection + Premium + Location ({features["word_intersection"]:.2f})'
            }
        
        return None
    
    def _universal_matching_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """Core universal matching - works with any API"""
        features = self._calculate_universal_features(ref_hotel, api_hotel)
        
        # Try high confidence rules first
        result = self._universal_high_confidence_rules(features)
        if result:
            return result
        
        # Try medium confidence rules
        result = self._universal_medium_confidence_rules(features)
        if result:
            return result
        
        # Try lower confidence rules
        result = self._universal_lower_confidence_rules(features)
        if result:
            return result
        
        return None
    
    def _try_api_enhancements(self, base_result: Dict, ref_hotel: pd.Series, api_hotel: pd.Series) -> Dict:
        """Try to enhance result with API-specific features"""
        if not base_result or not self.api_enhancements.get('has_structured_ids'):
            return base_result
        
        enhanced_result = base_result.copy()
        
        # Try API ID enhancement (Rate Hawk only)
        if self.api_source == 'rate_hawk' and 'id' in api_hotel:
            api_id = str(api_hotel.get('id', '')).lower()
            ref_brand = ref_hotel.get('brand', '')
            
            # Check for brand pattern in API ID
            for pattern, brand in self.api_enhancements.get('id_brand_patterns', {}).items():
                if pattern in api_id and ref_brand and fuzz.ratio(ref_brand, brand.lower()) / 100.0 > 0.85:
                    enhanced_result['confidence'] = min(enhanced_result['confidence'] + 0.05, 0.98)
                    enhanced_result['reason'] += ' + API ID Brand Enhancement'
                    break
        
        return enhanced_result
    
    def _universal_name_only_decision(self, ref_hotel: pd.Series, api_hotel: pd.Series) -> Optional[Dict]:
        """Universal name-only matching for fallback"""
        ref_name = ref_hotel['normalized_name']
        api_name = api_hotel['normalized_name']
        
        if not ref_name or not api_name:
            return None
        
        # Check brand match first
        ref_brand = ref_hotel.get('brand')
        api_brand = api_hotel.get('brand_from_name')
        brand_match = ref_brand and api_brand and fuzz.ratio(ref_brand, api_brand) / 100.0 > 0.85
        
        if brand_match:
            name_sim = fuzz.token_sort_ratio(ref_name, api_name) / 100.0
            confidence = 0.70 + name_sim * 0.15
            return {
                'match': True,
                'confidence': min(confidence, 0.88),
                'reason': f'Universal Brand + Name Similarity (Fallback) ({name_sim:.2f})'
            }
        
        # Regular name-only matching
        fuzz_token_sort = fuzz.token_sort_ratio(ref_name, api_name) / 100.0
        fuzz_token_set = fuzz.token_set_ratio(ref_name, api_name) / 100.0
        
        if fuzz_token_sort > 0.88:
            return {
                'match': True,
                'confidence': 0.75,
                'reason': f'Universal Names Almost Identical (Fallback) ({fuzz_token_sort:.2f})'
            }
        
        if fuzz_token_set > 0.85:
            return {
                'match': True,
                'confidence': 0.72,
                'reason': f'Universal Same Words Different Order (Fallback) ({fuzz_token_set:.2f})'
            }
        
        return None
    
    def run_matching(self) -> List[Dict]:
        """Match every reference hotel to its best API candidate and print statistics.

        For each reference hotel the API hotels sharing its ``country_iso``
        are scored with ``_universal_matching_decision``. When that finds
        nothing, or the best confidence is below 0.75, a name-only fallback
        scores up to 3000 API hotels (from any country) whose normalized name
        has a ``fuzz.ratio`` above 0.4, using
        ``_universal_name_only_decision``. The best candidate is recorded
        when its confidence is at least 0.55.

        Candidates are ranked by their final confidence, i.e. after
        ``_try_smart_api_enhancements`` has been applied, so the confidence
        used for ranking, the one stored in the record and the one printed
        always agree. The "Smart ID enhancements" counter counts recorded
        matches whose winning candidate was enhanced.

        Returns:
            One dict per matched reference hotel with the keys
            ``reference_id``, ``reference_name``, ``api_id``, ``api_name``,
            ``api_chain``, ``api_city``, ``api_address``, ``api_latitude``,
            ``api_longitude``, ``confidence``, ``match_reason`` and
            ``match_strategy``.
        """
        def final_verdict(result, ref_hotel, api_hotel):
            """Apply the API enhancements; return (confidence, reason, was_enhanced)."""
            # NOTE(review): _try_smart_api_enhancements is not defined in the
            # part of the file next to this method (only _try_api_enhancements
            # is) — confirm it exists on the class.
            enhanced = self._try_smart_api_enhancements(result, ref_hotel, api_hotel)
            if enhanced['confidence'] > result['confidence']:
                return enhanced['confidence'], enhanced['reason'], True
            return result['confidence'], result['reason'], False

        def build_record(ref_hotel, api_hotel, confidence, reason, strategy):
            """Assemble the output record for one (reference, API) pair."""
            return {
                'reference_id': ref_hotel['reference_id'],
                'reference_name': ref_hotel['clean_hotel'],
                'api_id': api_hotel.get('id', ''),
                'api_name': api_hotel['clean_name'],
                'api_chain': api_hotel['clean_chain'],
                'api_city': api_hotel['clean_city'],
                'api_address': api_hotel['clean_address'],
                'api_latitude': api_hotel.get('lat'),
                'api_longitude': api_hotel.get('lng'),
                'confidence': confidence,
                'match_reason': reason,
                'match_strategy': strategy
            }

        print("\n" + "="*60)
        print("🎯 RUNNING PRODUCTION HOTEL MATCHING")
        print("="*60)

        matches = []
        api_by_country = self.api_hotels.groupby('country_iso')
        total_hotels = len(self.reference_hotels)

        # Algorithm performance tracking, keyed by the fixed part of each rule's reason text
        algorithm_stats = {
            'Universal Brand + Location + High Name Similarity': 0,
            'Universal Brand Match + Perfect Location': 0,
            'Universal Name Almost Identical + Same Country': 0,
            'Universal Same Words Different Order + Location': 0,
            'Universal Brand + Strong Name Similarity': 0,
            'Universal Name Contains + Geographic + Premium': 0,
            'Universal Word Shuffle + Location': 0,
            'Universal Phonetic + Partial Match': 0,
            'Universal Word Intersection + Premium + Location': 0,
            'Universal Brand + Name Similarity (Fallback)': 0,
            'Universal Names Almost Identical (Fallback)': 0,
            'Universal Same Words Different Order (Fallback)': 0,
            'API Enhanced Matches': 0
        }

        print(f"📊 Processing {total_hotels} reference hotels")
        print(f"🔧 Universal core + {self.api_source} enhancements")
        print(f"🌍 API countries available: {len(api_by_country.groups)}")

        total_comparisons = 0
        skipped_no_country = 0
        skipped_no_api_country = 0
        enhanced_matches = 0

        for idx, (_, ref_hotel) in enumerate(self.reference_hotels.iterrows(), 1):
            ref_iso = ref_hotel['country_iso']
            ref_name = ref_hotel['clean_hotel']

            print(f"\n[{idx:2d}/{total_hotels}] Processing: {ref_name[:50]}...")
            print(f"    🌍 Country: {ref_iso}")

            best_match = None
            best_confidence = 0.0
            best_enhanced = False

            # Strategy 1: candidates from the same country
            if ref_iso and ref_iso in api_by_country.groups:
                candidates = api_by_country.get_group(ref_iso)
                print(f"    📍 Universal ISO candidates: {len(candidates):,}")

                for candidates_checked, (_, api_hotel) in enumerate(candidates.iterrows(), 1):
                    total_comparisons += 1

                    result = self._universal_matching_decision(ref_hotel, api_hotel)
                    if result:
                        confidence, reason, was_enhanced = final_verdict(result, ref_hotel, api_hotel)
                        if confidence > best_confidence:
                            best_confidence = confidence
                            best_enhanced = was_enhanced
                            best_match = build_record(
                                ref_hotel, api_hotel, confidence, reason, 'Universal ISO-based'
                            )

                    if candidates_checked % 1000 == 0:
                        print(f"    ... checked {candidates_checked:,}/{len(candidates):,} candidates")

            # Strategy 2: name-only fallback across all countries
            if not best_match or best_confidence < 0.75:
                if not ref_iso:
                    print(f"    ⚠️  No ISO mapping - trying universal name-only fallback")
                elif ref_iso not in api_by_country.groups:
                    print(f"    ⚠️  No API hotels for {ref_iso} - trying universal fallback")
                else:
                    print(f"    🔄 Low confidence ({best_confidence:.3f}) - trying universal fallback")

                # Pre-filter by plain name similarity and keep the 3000 most
                # similar hotels, so the rule evaluation stays affordable.
                name_similarities = []
                for _, api_hotel in self.api_hotels.iterrows():
                    if api_hotel['normalized_name']:
                        sim = fuzz.ratio(ref_hotel['normalized_name'], api_hotel['normalized_name']) / 100.0
                        if sim > 0.4:
                            name_similarities.append((sim, api_hotel))

                name_similarities.sort(key=lambda x: x[0], reverse=True)
                top_candidates = name_similarities[:3000]

                print(f"    🔍 Universal name-only candidates: {len(top_candidates)}")

                for _sim_score, api_hotel in top_candidates:
                    total_comparisons += 1

                    result = self._universal_name_only_decision(ref_hotel, api_hotel)
                    if result:
                        confidence, reason, was_enhanced = final_verdict(result, ref_hotel, api_hotel)
                        if confidence > best_confidence:
                            best_confidence = confidence
                            best_enhanced = was_enhanced
                            best_match = build_record(
                                ref_hotel, api_hotel, confidence, reason, 'Universal Name-only Fallback'
                            )

                if not top_candidates:
                    if not ref_iso:
                        skipped_no_country += 1
                    else:
                        skipped_no_api_country += 1

            # Record result
            if best_match and best_confidence >= 0.55:
                matches.append(best_match)
                if best_enhanced:
                    enhanced_matches += 1

                # Attribute the match to the first rule whose name occurs in the reason
                reason = best_match['match_reason']
                for algo_name in algorithm_stats:
                    if algo_name in reason:
                        algorithm_stats[algo_name] += 1
                        break

                if 'API ID' in reason or 'Enhancement' in reason:
                    algorithm_stats['API Enhanced Matches'] += 1

                confidence_level = "🟢 HIGH" if best_confidence >= 0.85 else "🟡 MEDIUM" if best_confidence >= 0.70 else "🔴 LOW"
                print(f"    ✅ MATCH FOUND: {best_match['api_name'][:40]}...")
                print(f"    📊 Confidence: {best_confidence:.3f} ({confidence_level})")
                print(f"    🎯 Strategy: {best_match['match_strategy']}")
                print(f"    📝 Reason: {best_match['match_reason'][:60]}...")
            else:
                print(f"    ❌ No match found (best confidence: {best_confidence:.3f})")

        # An empty reference list has no meaningful coverage; report 0 instead of dividing by zero.
        coverage = len(matches) / total_hotels * 100 if total_hotels else 0.0

        print(f"\n" + "="*60)
        print("🎯 PRODUCTION MATCHING COMPLETED")
        print("="*60)
        print(f"✅ Total matches found: {len(matches)}")
        print(f"📊 Coverage: {len(matches)}/{total_hotels} ({coverage:.1f}%)")
        print(f"⚡ Performance stats:")
        print(f"   📊 Total comparisons: {total_comparisons:,}")
        print(f"   🚫 Skipped (no ISO): {skipped_no_country}")
        print(f"   🚫 Skipped (no API country): {skipped_no_api_country}")
        print(f"   🔧 Smart ID enhancements applied: {enhanced_matches}")

        print(f"\n🧠 ALGORITHM EFFECTIVENESS:")
        for algo_name, count in algorithm_stats.items():
            if count > 0:
                percentage = (count / len(matches)) * 100 if matches else 0
                print(f"   📈 {algo_name}: {count} matches ({percentage:.1f}%)")

        # Strategy breakdown
        if matches:
            iso_matches = sum(1 for m in matches if m['match_strategy'] == 'Universal ISO-based')
            name_matches = sum(1 for m in matches if m['match_strategy'] == 'Universal Name-only Fallback')

            print(f"\n📊 STRATEGY BREAKDOWN:")
            print(f"   🌍 Universal ISO-based: {iso_matches} ({iso_matches/len(matches)*100:.1f}%)")
            print(f"   🔍 Universal Name-only Fallback: {name_matches} ({name_matches/len(matches)*100:.1f}%)")

            if enhanced_matches > 0:
                print(f"   🔧 Smart ID enhancements: {enhanced_matches} ({enhanced_matches/len(matches)*100:.1f}%)")

        if matches:
            confidences = [m['confidence'] for m in matches]
            high_conf = sum(1 for c in confidences if c >= 0.85)
            med_conf = sum(1 for c in confidences if 0.70 <= c < 0.85)
            low_conf = sum(1 for c in confidences if c < 0.70)

            print(f"\n📈 CONFIDENCE DISTRIBUTION:")
            print(f"   🟢 High (≥0.85): {high_conf}")
            print(f"   🟡 Medium (0.70-0.84): {med_conf}")
            print(f"   🔴 Low (<0.70): {low_conf}")

        return matches
    
    def create_full_reference_with_results(self, matches: List[Dict]) -> pd.DataFrame:
        """Return the full reference list with the match data joined on ``reference_id``.

        Every reference hotel gets one row. The columns ``api_id``,
        ``api_name``, ``api_chain``, ``api_city``, ``api_address``,
        ``api_latitude``, ``api_longitude``, ``confidence``, ``match_reason``
        and ``match_strategy`` hold the values of the hotel's match, or ``''``
        when the hotel has no match (or the match lacks that key).

        ``matched`` is True exactly for the hotels that appear in ``matches``.
        It is deliberately not derived from ``api_id``, because a match may
        carry an empty id.

        Args:
            matches: Match records as produced by ``run_matching``; each must
                contain ``reference_id``.
        """
        results_df = self.reference_hotels[['Lokalizacja', 'Hotel', 'reference_id', 'country_raw', 'city', 'country_iso']].copy()

        results_df = results_df.rename(columns={
            'country_raw': 'reference_country',
            'city': 'reference_city'
        })

        match_dict = {match['reference_id']: match for match in matches}

        match_columns = [
            'api_id', 'api_name', 'api_chain', 'api_city', 'api_address',
            'api_latitude', 'api_longitude', 'confidence', 'match_reason', 'match_strategy',
        ]
        for column in match_columns:
            # ``column=column`` binds the current name; a plain closure would see the last one.
            results_df[column] = results_df['reference_id'].map(
                lambda ref_id, column=column: match_dict.get(ref_id, {}).get(column, '')
            )

        results_df['matched'] = results_df['reference_id'].isin(list(match_dict))

        return results_df
    
    def run_complete_matching(self, reference_csv: str, api_csv: str) -> Dict:
        """Load both CSV files, run the matching and return a result summary.

        Args:
            reference_csv: Path of the reference hotel list.
            api_csv: Path of the API hotel dump.

        Returns:
            A dict with ``total_reference_hotels``, ``total_api_hotels``,
            ``total_matches``, ``coverage_percentage`` (0.0 when the reference
            list is empty), ``matches`` and ``api_source``; or
            ``{'error': ...}`` when a loader left its DataFrame unset.
        """
        print("🚀" + "="*58 + "🚀")
        print("🚀 PRODUCTION HOTEL MATCHER - UNIVERSAL CORE + API ENHANCEMENT 🚀")
        print("🚀" + "="*58 + "🚀")
        print(f"Architecture: Universal Core (80% baseline) + {self.api_source.title()} Enhancements")
        print("Features: ISO pre-filtering + Universal algorithms + API-specific boosters")

        self.load_reference_hotels(reference_csv)
        if self.reference_hotels is None:
            return {'error': 'Failed to load reference hotels'}

        self.load_api_hotels(api_csv)
        if self.api_hotels is None:
            return {'error': 'Failed to load API hotels'}

        matches = self.run_matching()

        reference_count = len(self.reference_hotels)
        # Coverage of an empty reference list is reported as 0 rather than dividing by zero.
        coverage = len(matches) / reference_count * 100 if reference_count else 0.0

        results = {
            'total_reference_hotels': reference_count,
            'total_api_hotels': len(self.api_hotels),
            'total_matches': len(matches),
            'coverage_percentage': coverage,
            'matches': matches,
            'api_source': self.api_source
        }

        return results
    
    def save_results(self, results: Dict, output_prefix: str = "production_hotel_mapping"):
        """Write the matching results to CSV files and print a final summary.

        File names start with ``"<output_prefix>_<api_source>"``. Three files
        are written with UTF-8-BOM encoding: ``_matches.csv`` (only when there
        are matches), ``_full_reference.csv`` and ``_summary.csv``.

        Args:
            results: Result dict as returned by ``run_complete_matching``.
            output_prefix: Leading part of the output file names.
        """
        divider = "=" * 60
        print("\n" + divider)
        print("💾 SAVING PRODUCTION RESULTS")
        print(divider)

        matches = results['matches']
        api_source = results.get('api_source', 'universal')
        output_prefix = f"{output_prefix}_{api_source}"

        def export(frame, suffix):
            # All outputs share the same CSV settings; returns the file name for logging.
            target = f"{output_prefix}_{suffix}.csv"
            frame.to_csv(target, index=False, encoding='utf-8-sig')
            return target

        if matches:
            matches_file = export(pd.DataFrame(matches), 'matches')
            print(f"✅ Saved matches: {matches_file}")

        full_file = export(self.create_full_reference_with_results(matches), 'full_reference')
        print(f"✅ Saved full reference: {full_file}")

        summary_rows = [
            ('API Source', api_source),
            ('Total Reference Hotels', results['total_reference_hotels']),
            ('Total API Hotels', results['total_api_hotels']),
            ('Total Matches', results['total_matches']),
            ('Coverage Percentage', f"{results['coverage_percentage']:.1f}%"),
        ]
        summary = pd.DataFrame({
            'metric': [label for label, _ in summary_rows],
            'value': [value for _, value in summary_rows],
        })
        summary_file = export(summary, 'summary')
        print(f"✅ Saved summary: {summary_file}")

        print(f"\n🎯 PRODUCTION FINAL RESULTS:")
        print(f"   🔧 API Source: {api_source}")
        print(f"   📚 Reference hotels: {results['total_reference_hotels']}")
        print(f"   📡 API hotels: {results['total_api_hotels']:,}")
        print(f"   ✅ Matches found: {results['total_matches']}")
        print(f"   📊 Coverage: {results['coverage_percentage']:.1f}%")


if __name__ == "__main__":
    # Example run: match the same reference list and API dump twice, once with
    # the Rate Hawk configuration and once with the API-agnostic one, then
    # compare the coverage of both runs.

    # Rate Hawk with enhancements
    print("Testing with Rate Hawk API (with ID enhancements)...")
    matcher_rh = ProductionHotelMatcher(api_source='rate_hawk')
    results_rh = matcher_rh.run_complete_matching(
        reference_csv="lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"
    )
    matcher_rh.save_results(results_rh)

    # Universal mode (works with any API)
    print("\n" + "="*60)
    print("Testing in Universal mode (no API-specific features)...")
    matcher_universal = ProductionHotelMatcher(api_source='universal')
    results_universal = matcher_universal.run_complete_matching(
        reference_csv="lista_referencyjna.csv",
        api_csv="01_api_rate_hawk.csv"  # Same data, but treated as universal
    )
    matcher_universal.save_results(results_universal)

    coverage_gain = results_rh['coverage_percentage'] - results_universal['coverage_percentage']

    print("\n" + "="*60)
    print("🏁 PRODUCTION HOTEL MATCHING COMPLETED!")
    print("="*60)
    print(f"Rate Hawk Enhanced: {results_rh['coverage_percentage']:.1f}%")
    print(f"Universal Mode: {results_universal['coverage_percentage']:.1f}%")
    # The '+' sign option prints the sign for both directions, so a loss shows
    # as "-1.2%" instead of "+-1.2%".
    print(f"Enhancement Gain: {coverage_gain:+.1f}%")

Testing with Rate Hawk API (with ID enhancements)...
🚀 PRODUCTION HOTEL MATCHER - UNIVERSAL CORE + API ENHANCEMENT 🚀
Architecture: Universal Core (80% baseline) + Rate_Hawk Enhancements
Features: ISO pre-filtering + Universal algorithms + API-specific boosters

📂 LOADING REFERENCE HOTELS
✅ Loaded file: lista_referencyjna.csv
📊 Found 99 reference hotels

🧹 Processing reference data...
🌍 Mapping countries to ISO codes...
⚠️  Unmapped countries: ['The Ritz-Carlton Yacht Collection']
📈 Countries distribution:
   GR (Greece): 13 hotels
   MV (Maldives): 12 hotels
   AE (United Arab Emirates): 11 hotels
   ES (Spain): 8 hotels
   TH (Thailand): 7 hotels
🏨 Universal brands detected: 26
✅ Processed 99 reference hotels

📡 LOADING API HOTELS
✅ Loaded file: 01_api_rate_hawk.csv
📊 Found 95,463 API hotels

🧹 Processing API data...
🌍 API countries distribution:
   AE (United Arab Emirates): 12,942 hotels
   GR (Greece): 8,592 hotels
   TH (Thailand): 8,063 hotels
   US (United States): 7,858 hotels
