In [6]:
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
import pandas as pd
import re
from dataclasses import dataclass, asdict
from typing import List, Dict
import logging
import hashlib
import requests
from bs4 import BeautifulSoup

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class AcupuncturistData:
    """Data structure for acupuncturist information"""
    full_name: str = ""
    business_name: str = ""
    website: str = ""
    email: str = ""
    phone: str = ""
    location: str = ""
    source_url: str = ""
    
    def to_dict(self) -> Dict:
        return asdict(self)
    
    def get_hash(self) -> str:
        """Generate unique hash for deduplication"""
        name = self.full_name.lower().strip()
        business = self.business_name.lower().strip()
        phone = re.sub(r'[^\d]', '', self.phone)
        email = self.email.lower().strip()
        key = f"{name}|{business}|{phone}|{email}"
        return hashlib.md5(key.encode()).hexdigest()

class SimpleFallbackScraper:
    """Enhanced fallback scraper for Swedish healthcare practitioners"""
    
    def __init__(self, search_category="akupunktur"):
        self.session = requests.Session()
        self.search_category = search_category.lower()
        
        # Swedish user agent
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Accept-Language': 'sv-SE,sv;q=0.9,en;q=0.8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
        })
        self.results = []
        
        # Define search terms for different categories
        self.category_terms = {
            'akupunktur': ['akupunktur', 'akupunktör', 'traditionell kinesisk medicin', 'tcm'],
            'osteopath': ['osteopat', 'osteopati', 'osteopatisk', 'osteopathic'],
            'naprapath': ['naprapati', 'naprapatisk', 'naprapath'],
            'kiropraktor': ['kiropraktor', 'kiropraktik', 'chiropractic'],
            'fysioterapeut': ['fysioterapi', 'fysioterapeut', 'physiotherapy'],
            'massageterapeut': ['massage', 'massageterapeut', 'massör'],
            'homeopat': ['homeopati', 'homeopatisk', 'homeopathic'],
            'naturterapeut': ['naturterapi', 'naturterapeut', 'naturläkare']
        }
    
    def get_search_terms(self):
        """Get search terms for current category"""
        return self.category_terms.get(self.search_category, [self.search_category])
    
    def extract_contact_info(self, text):
        """Enhanced extraction of email, phone, and website"""
        # Email patterns
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, text)
        
        # Swedish phone patterns (more comprehensive)
        phone_patterns = [
            r'\b(?:\+46|0)(?:\s*[-/]?\s*)?(?:\d{1,4}(?:\s*[-/]?\s*)?){2,4}\b',
            r'\b\d{2,4}[-/\s]\d{2,4}[-/\s]\d{2,4}\b',
            r'\b\d{3}\s*-\s*\d{2}\s*\d{2}\s*\d{2}\b',  # Swedish format
            r'\b08[-\s]\d{3}\s*\d{2}\s*\d{2}\b'  # Stockholm numbers
        ]
        
        phones = []
        for pattern in phone_patterns:
            phones.extend(re.findall(pattern, text))
        
        # Website patterns
        website_patterns = [
            r'https?://[^\s<>"\']+',
            r'www\.[^\s<>"\']+',
            r'\b[a-zA-Z0-9-]+\.(?:se|com|net|org|nu)\b'
        ]
        
        websites = []
        for pattern in website_patterns:
            websites.extend(re.findall(pattern, text))
        
        return {
            'email': emails[0] if emails else "",
            'phone': phones[0] if phones else "",
            'website': websites[0] if websites else ""
        }
    
    def extract_practitioner_name(self, text):
        """Extract individual practitioner names"""
        # Common Swedish name patterns
        name_patterns = [
            r'\b[A-ZÅÄÖ][a-zåäö]+\s+[A-ZÅÄÖ][a-zåäö]+(?:\s+[A-ZÅÄÖ][a-zåäö]+)?\b',  # First Last [Middle]
            r'\b(?:Dr|Leg|Med)\.\s*[A-ZÅÄÖ][a-zåäö]+\s+[A-ZÅÄÖ][a-zåäö]+\b',  # Dr. First Last
            r'\b[A-ZÅÄÖ][a-zåäö]+\s+[A-ZÅÄÖ][a-zåäö]+-[A-ZÅÄÖ][a-zåäö]+\b'  # Hyphenated names
        ]
        
        names = []
        for pattern in name_patterns:
            found_names = re.findall(pattern, text)
            for name in found_names:
                # Filter out common words
                if not any(word in name.lower() for word in ['akupunktur', 'klinik', 'center', 'massage', 'terapi']):
                    names.append(name)
        
        return names[0] if names else ""
    
    def extract_location(self, text):
        """Extract Swedish locations"""
        # Swedish cities and regions
        swedish_locations = [
            'Stockholm', 'Göteborg', 'Malmö', 'Uppsala', 'Västerås', 'Örebro', 'Linköping',
            'Helsingborg', 'Jönköping', 'Norrköping', 'Lund', 'Umeå', 'Gävle', 'Borås',
            'Sundsvall', 'Eskilstuna', 'Halmstad', 'Växjö', 'Karlstad', 'Skövde',
            'Trollhättan', 'Uddevalla', 'Motala', 'Borlänge', 'Tumba', 'Falun',
            'Kalmar', 'Kristianstad', 'Karlskrona', 'Landskrona', 'Trelleborg', 'Ystad'
        ]
        
        # Look for postal codes + city
        postal_pattern = r'\b\d{3}\s?\d{2}\s+([A-ZÅÄÖ][a-zåäö]+)\b'
        postal_matches = re.findall(postal_pattern, text)
        
        if postal_matches:
            return postal_matches[0]
        
        # Look for known cities
        for city in swedish_locations:
            if city.lower() in text.lower():
                return city
        
        return ""
    
    def scrape_simple_hitta(self):
        """Simple scrape of Hitta.se mobile version"""
        try:
            import urllib.parse
            
            # Fix URL encoding for Swedish characters
            search_term = urllib.parse.quote("akupunktur", safe='')
            url = f"https://www.hitta.se/sök?q={search_term}"
            
            print(f"Trying Hitta.se: {url}")
            time.sleep(2)  # Be respectful
            
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Look for any mentions of acupuncture businesses
            text_content = soup.get_text()
            
            # Find lines with akupunktur
            lines = text_content.split('\n')
            business_names = []
            
            for line in lines:
                line = line.strip()
                if ('akupunktur' in line.lower() or 'akupunkt' in line.lower()) and len(line) > 10 and len(line) < 100:
                    # Filter out navigation and common website text
                    if not any(skip_word in line.lower() for skip_word in ['sök', 'meny', 'logga in', 'cookies', 'gdpr']):
                        business_names.append(line)
            
            # Create entries from found names
            for name in set(business_names):  # Remove duplicates
                if name and len(name) > 5:
                    data = AcupuncturistData()
                    data.source_url = url
                    data.business_name = name
                    data.location = "Sverige"  # Default location
                    
                    # Try to extract contact info from surrounding text
                    email, phone = self.extract_contact_info(text_content)
                    if email:
                        data.email = email
                    if phone:
                        data.phone = phone
                    
                    self.results.append(data)
            
            logger.info(f"Hitta.se found {len(business_names)} potential businesses")
            
        except Exception as e:
            logger.error(f"Hitta.se scraper failed: {e}")
    
    def scrape_manual_list(self):
        """Skip demonstration data - focus on real data only"""
        logger.info("Skipping demonstration data - collecting real practitioners only")
        pass
    
    def scrape_enhanced_hitta(self):
        """Enhanced scrape of Hitta.se with aggressive real data extraction"""
        try:
            import urllib.parse
            
            search_terms = self.get_search_terms()
            
            for term in search_terms[:3]:  # Try more terms for better coverage
                search_term_encoded = urllib.parse.quote(term, safe='')
                url = f"https://www.hitta.se/sök?q={search_term_encoded}"
                
                print(f"   Searching Hitta.se for: {term}")
                time.sleep(2)  # Reduced delay for faster collection
                
                try:
                    response = self.session.get(url, timeout=15)
                    if response.status_code != 200:
                        continue
                    
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # More aggressive text parsing for real businesses
                    page_text = soup.get_text()
                    
                    # Look for Swedish business patterns
                    lines = page_text.split('\n')
                    
                    for i, line in enumerate(lines):
                        line = line.strip()
                        
                        # Find lines containing our search terms
                        if any(search_term in line.lower() for search_term in search_terms):
                            if 10 < len(line) < 100:  # Reasonable business name length
                                # Skip navigation/UI elements
                                if not any(skip in line.lower() for skip in 
                                         ['sök', 'meny', 'logga', 'cookies', 'visa', 'hitta']):
                                    
                                    data = AcupuncturistData()
                                    data.source_url = url
                                    data.business_name = line
                                    
                                    # Look for contact info in surrounding lines
                                    context_lines = lines[max(0, i-2):i+3]
                                    context_text = ' '.join(context_lines)
                                    
                                    # Extract contact info
                                    contact_info = self.extract_contact_info(context_text)
                                    data.email = contact_info['email']
                                    data.phone = contact_info['phone']
                                    data.website = contact_info['website']
                                    
                                    # Extract location
                                    data.location = self.extract_location(context_text)
                                    
                                    # Try to extract practitioner name from context
                                    data.full_name = self.extract_practitioner_name(context_text)
                                    
                                    self.results.append(data)
                                    logger.info(f"Found real business: {line}")
                
                except Exception as e:
                    logger.debug(f"Error scraping {url}: {e}")
                    continue
            
            found_count = len([r for r in self.results if 'hitta.se' in r.source_url])
            logger.info(f"Hitta.se real data collection: {found_count} entries for {self.search_category}")
            
        except Exception as e:
            logger.error(f"Enhanced Hitta.se scraper failed: {e}")
    
    def scrape_enhanced_directories(self):
        """Enhanced scraping of Swedish business directories"""
        try:
            search_terms = self.get_search_terms()
            primary_term = search_terms[0]
            
            # Swedish business directories to try
            directories = [
                f"https://www.merinfo.se/search?q={primary_term}",
                f"https://www.allabolag.se/what/{primary_term}",
                f"https://www.ratsit.se/sök?q={primary_term}"
            ]
            
            for url in directories:
                try:
                    print(f"   Trying directory: {url.split('//')[1].split('/')[0]}")
                    time.sleep(4)  # Respectful delay
                    
                    response = self.session.get(url, timeout=15)
                    if response.status_code != 200:
                        continue
                    
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Look for business listings with contact information
                    business_sections = soup.find_all(['div', 'article', 'section'], 
                                                    class_=re.compile(r'company|business|result|hit'))
                    
                    for section in business_sections[:3]:  # Limit to avoid rate limiting
                        section_text = section.get_text()
                        
                        # Check if this section contains our search terms
                        if not any(term in section_text.lower() for term in search_terms):
                            continue
                        
                        data = AcupuncturistData()
                        data.source_url = url
                        
                        # Extract names
                        data.full_name = self.extract_practitioner_name(section_text)
                        
                        # Extract business name
                        lines = section_text.split('\n')
                        for line in lines:
                            line = line.strip()
                            if any(term in line.lower() for term in search_terms) and 10 < len(line) < 100:
                                data.business_name = line
                                break
                        
                        # Extract contact information
                        contact_info = self.extract_contact_info(section_text)
                        data.email = contact_info['email']
                        data.phone = contact_info['phone']
                        data.website = contact_info['website']
                        
                        # Extract location
                        data.location = self.extract_location(section_text)
                        
                        if data.business_name or data.full_name:
                            self.results.append(data)
                    
                    # Don't try all directories if we found data
                    directory_results = len([r for r in self.results if url.split('//')[1].split('/')[0] in r.source_url])
                    if directory_results > 0:
                        logger.info(f"Found {directory_results} entries from {url.split('//')[1].split('/')[0]}")
                        break
                        
                except Exception as e:
                    logger.debug(f"Directory {url} failed: {e}")
                    continue
                    
        except Exception as e:
            logger.error(f"Enhanced directories scraper failed: {e}")
    

    
    def scrape_google_search(self):
        """Category-aware Google search for Swedish healthcare practitioners"""
        try:
            search_terms = self.get_search_terms()
            cities = ['stockholm', 'göteborg', 'malmö']
            
            for city in cities[:2]:  # Limit cities to avoid rate limiting
                for term in search_terms[:1]:  # Use primary term only
                    try:
                        time.sleep(5)  # Respectful delay
                        
                        query = f"{term} {city}"
                        search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}&num=3"
                        
                        print(f"   Google search: {query}")
                        
                        headers = {
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                            'Accept-Language': 'sv-SE,sv;q=0.9,en;q=0.8',
                            'Accept-Encoding': 'gzip, deflate',
                            'DNT': '1',
                            'Connection': 'keep-alive',
                            'Upgrade-Insecure-Requests': '1',
                        }
                        
                        response = self.session.get(search_url, headers=headers, timeout=15)
                        
                        if response.status_code == 200:
                            # Basic parsing for business references
                            text = response.text.lower()
                            if any(search_term in text for search_term in search_terms):
                                # Create basic entry
                                data = AcupuncturistData()
                                data.source_url = search_url
                                data.business_name = f"{term.title()} {city.title()}"
                                data.location = city.title()
                                
                                self.results.append(data)
                                logger.info(f"Google found reference for {term} in {city}")
                        
                    except Exception as e:
                        logger.debug(f"Google search failed for {term} {city}: {e}")
                        continue
            
            google_count = len([r for r in self.results if 'google' in r.source_url.lower()])
            logger.info(f"Google search completed with {google_count} references")
            
        except Exception as e:
            logger.error(f"Google search failed: {e}")
    
    def export_results(self, filename=None):
        """Export results with all fields"""
        if not self.results:
            return None
        
        if filename is None:
            filename = f"{self.search_category}_practitioners.xlsx"
        
        # Convert to DataFrame with all fields
        data_dicts = []
        for result in self.results:
            data_dict = result.to_dict()
            data_dicts.append(data_dict)
        
        df = pd.DataFrame(data_dicts)
        
        # Reorder columns to match requirements
        column_order = ['full_name', 'business_name', 'website', 'email', 'phone', 'location', 'source_url']
        df = df.reindex(columns=column_order)
        
        # Clean up data
        for col in df.columns:
            if col != 'source_url':
                df[col] = df[col].fillna('').astype(str).str.strip()
        
        df.to_excel(filename, index=False)
        logger.info(f"Exported {len(df)} {self.search_category} practitioners to {filename}")
        
        return df

class StealthBrowser:
    """Enhanced stealth browser setup"""
    
    def __init__(self, use_proxy=False, proxy_config=None):
        self.driver = None
        self.use_proxy = use_proxy
        self.proxy_config = proxy_config
        self.setup_browser()
    
    def setup_browser(self):
        """Setup undetected Chrome with macOS-specific fixes"""
        try:
            # Chrome options for stealth
            options = uc.ChromeOptions()
            
            # macOS-specific fixes
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-software-rasterizer')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_argument('--lang=sv-SE')
            options.add_argument('--window-size=1920,1080')
            
            # Proxy configuration if provided
            if self.use_proxy and self.proxy_config:
                proxy_string = f"{self.proxy_config['host']}:{self.proxy_config['port']}"
                options.add_argument(f'--proxy-server={proxy_string}')
                logger.info(f"Using proxy: {proxy_string}")
            
            # macOS-specific Chrome path detection
            import platform
            if platform.system() == "Darwin":  # macOS
                chrome_paths = [
                    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                    "/Applications/Chromium.app/Contents/MacOS/Chromium"
                ]
                
                for chrome_path in chrome_paths:
                    import os
                    if os.path.exists(chrome_path):
                        options.binary_location = chrome_path
                        logger.info(f"Using Chrome at: {chrome_path}")
                        break
            
            # Create driver
            try:
                self.driver = uc.Chrome(options=options, version_main=119)
            except:
                try:
                    self.driver = uc.Chrome(options=options, version_main=None)
                except:
                    logger.warning("Falling back to regular Selenium WebDriver")
                    from selenium.webdriver.chrome.service import Service
                    service = Service()
                    self.driver = webdriver.Chrome(service=service, options=options)
            
            logger.info("Stealth browser setup completed successfully")
            
        except Exception as e:
            logger.error(f"Failed to setup stealth browser: {e}")
            raise
    
    def safe_get(self, url, max_retries=3):
        """Safely navigate to URL with retries"""
        for attempt in range(max_retries):
            try:
                logger.info(f"Navigating to: {url} (attempt {attempt + 1})")
                self.driver.get(url)
                
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                
                # Random scroll
                self.driver.execute_script(f"window.scrollTo(0, {random.randint(100, 500)});")
                time.sleep(random.uniform(1, 3))
                
                return True
                
            except Exception as e:
                logger.warning(f"Navigation attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(2, 5))
        
        return False
    
    def extract_text_safely(self, element):
        """Safely extract text from element"""
        try:
            return element.text.strip() if element else ""
        except:
            return ""
    
    def close(self):
        """Close browser safely"""
        try:
            if self.driver:
                self.driver.quit()
        except:
            pass

class SwedishAcupuncturistScraper:
    """Main scraper class using stealth browser"""
    
    def __init__(self, use_proxy=False, proxy_config=None):
        self.browser = StealthBrowser(use_proxy, proxy_config)
        self.driver = self.browser.driver
        self.results = []
    
    def extract_contact_info(self, text):
        """Extract email and phone from text"""
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        phone_pattern = r'\b(?:\+46|0)(?:\s*-?\s*)?(?:\d{1,4}(?:\s*-?\s*)?){2,4}\b'
        
        email = re.search(email_pattern, text)
        phone = re.search(phone_pattern, text)
        
        return (email.group(0) if email else "", phone.group(0) if phone else "")
    
    def scrape_hitta_se(self):
        """Scrape Hitta.se for acupuncturists"""
        logger.info("Starting Hitta.se scraping...")
        
        search_terms = ["akupunktur"]
        
        for term in search_terms:
            try:
                search_url = f"https://www.hitta.se/sök/{term}"
                if not self.browser.safe_get(search_url):
                    continue
                
                time.sleep(random.uniform(3, 5))
                
                # Find business listings
                try:
                    listings = self.driver.find_elements(By.CSS_SELECTOR, 
                        "div[data-testid*='search-result'], .search-result, .hit-item, article")
                    
                    logger.info(f"Found {len(listings)} listings for '{term}'")
                    
                    for listing in listings[:10]:
                        try:
                            data = AcupuncturistData()
                            data.source_url = search_url
                            
                            # Extract business name
                            name_selectors = ["h2", "h3", "[data-testid*='name']", ".company-name"]
                            for selector in name_selectors:
                                try:
                                    name_elem = listing.find_element(By.CSS_SELECTOR, selector)
                                    if name_elem:
                                        data.business_name = self.browser.extract_text_safely(name_elem)
                                        break
                                except:
                                    continue
                            
                            # Extract all text for contact info
                            all_text = self.browser.extract_text_safely(listing)
                            email, phone_from_text = self.extract_contact_info(all_text)
                            data.email = email
                            data.phone = phone_from_text
                            
                            if data.business_name and 'akupunktur' in data.business_name.lower():
                                self.results.append(data)
                                logger.info(f"Added: {data.business_name}")
                            
                            time.sleep(random.uniform(0.5, 1.5))
                            
                        except Exception as e:
                            logger.debug(f"Error extracting listing data: {e}")
                            continue
                
                except Exception as e:
                    logger.error(f"Error finding listings on Hitta.se: {e}")
                
            except Exception as e:
                logger.error(f"Error scraping Hitta.se for '{term}': {e}")
        
        logger.info(f"Completed Hitta.se scraping. Found {len(self.results)} results.")
    
    def run_all_scrapers(self):
        """Run all scrapers in sequence"""
        logger.info("Starting comprehensive acupuncturist scraping...")
        
        try:
            self.scrape_hitta_se()
        except Exception as e:
            logger.error(f"Error in Hitta.se scraper: {e}")
        
        logger.info(f"Scraping completed! Total results: {len(self.results)}")
    
    def export_results(self, filename="swedish_acupuncturists_stealth.xlsx"):
        """Export results to Excel"""
        if not self.results:
            logger.warning("No results to export!")
            return None
        
        df = pd.DataFrame([result.to_dict() for result in self.results])
        df.to_excel(filename, index=False)
        logger.info(f"Exported {len(df)} results to {filename}")
        
        return df
    
    def close(self):
        """Close browser"""
        self.browser.close()

# Configuration for proxy (optional)
PROXY_CONFIG = {
    'host': 'proxy.example.com',
    'port': '8080',
    'username': 'your_username',
    'password': 'your_password'
}

def quick_fallback_scrape(search_category="akupunktur"):
    """Enhanced fallback scraping for any healthcare category"""
    print(f"\n🔄 Enhanced Fallback Scraper for {search_category.title()}")
    print("=" * 60)
    
    try:
        scraper = SimpleFallbackScraper(search_category)
        
        # Try multiple approaches
        print("📍 Step 1: Adding demonstration data...")
        scraper.scrape_manual_list()
        
        print("📍 Step 2: Searching Hitta.se...")
        scraper.scrape_enhanced_hitta()
        
        print("📍 Step 3: Trying Swedish business directories...")
        scraper.scrape_enhanced_directories()
        
        print("📍 Step 4: Simplified search...")
        scraper.scrape_google_search()
        
        # Remove duplicates based on business name and full name
        unique_results = []
        seen_identifiers = set()
        
        for result in scraper.results:
            # Create identifier from available names
            identifier = (result.business_name.lower().strip(), result.full_name.lower().strip())
            if identifier not in seen_identifiers and (result.business_name or result.full_name):
                seen_identifiers.add(identifier)
                unique_results.append(result)
        
        scraper.results = unique_results
        
        if scraper.results:
            # Export with category in filename
            filename = f"{search_category}_practitioners.xlsx"
            df = scraper.export_results(filename)
            
            print(f"\n✅ Enhanced scraper collected {len(scraper.results)} {search_category} entries!")
            
            print("\n📊 Sample results:")
            print("-" * 70)
            print(f"{'#':>2} {'Name/Business':<25} {'Location':<12} {'Phone':<12} {'Email':<15}")
            print("-" * 70)
            
            for i, result in enumerate(scraper.results[:8], 1):
                name = result.full_name or result.business_name or "N/A"
                name = name[:24] if len(name) > 24 else name
                location = result.location[:11] if result.location and len(result.location) > 11 else (result.location or "N/A")
                phone = result.phone[:11] if result.phone and len(result.phone) > 11 else (result.phone or "N/A")
                email = result.email[:14] if result.email and len(result.email) > 14 else (result.email or "N/A")
                
                print(f"{i:2d} {name:<25} {location:<12} {phone:<12} {email:<15}")
            
            if len(scraper.results) > 8:
                print(f"    ... and {len(scraper.results) - 8} more entries")
            
            print(f"\n📁 Data exported to: {filename}")
            
            # Show comprehensive statistics
            stats = {
                'Total entries': len(scraper.results),
                'With full names': len([r for r in scraper.results if r.full_name]),
                'With business names': len([r for r in scraper.results if r.business_name]),
                'With phone numbers': len([r for r in scraper.results if r.phone]),
                'With emails': len([r for r in scraper.results if r.email]),
                'With websites': len([r for r in scraper.results if r.website]),
                'With locations': len([r for r in scraper.results if r.location]),
            }
            
            print("\n📈 Collection Statistics:")
            for key, value in stats.items():
                percentage = f"({value/len(scraper.results)*100:.0f}%)" if len(scraper.results) > 0 else ""
                print(f"   {key}: {value} {percentage}")
            
            return df
        else:
            print(f"❌ No {search_category} practitioners found")
            print("\n💡 Try:")
            print("   - Different search category")
            print("   - Manual data collection")
            print("   - Professional association directories")
            
            return None
            
    except Exception as e:
        print(f"❌ Enhanced scraper failed: {e}")
        return None

def select_healthcare_category():
    """Let user select healthcare category"""
    categories = {
        '1': ('akupunktur', 'Akupunkturörer / Acupuncturists'),
        '2': ('osteopath', 'Osteopater / Osteopaths'),
        '3': ('naprapath', 'Naprapater / Naprapaths'),
        '4': ('kiropraktor', 'Kiropraktorer / Chiropractors'),
        '5': ('fysioterapeut', 'Fysioterapeuter / Physiotherapists'),
        '6': ('massageterapeut', 'Massageterapeuter / Massage Therapists'),
        '7': ('homeopat', 'Homeopater / Homeopaths'),
        '8': ('naturterapeut', 'Naturterapeuter / Naturopaths'),
        '9': ('custom', 'Custom search term')
    }
    
    print("\n🏥 Select Healthcare Category:")
    print("=" * 40)
    for key, (category, description) in categories.items():
        print(f"{key}. {description}")
    
    while True:
        choice = input(f"\nChoose category (1-9) [default: 1]: ").strip()
        
        if not choice:
            choice = '1'
        
        if choice in categories:
            if choice == '9':
                custom_term = input("Enter custom search term (Swedish): ").strip()
                if custom_term:
                    return custom_term.lower()
                else:
                    continue
            else:
                return categories[choice][0]
        else:
            print("❌ Invalid choice. Please select 1-9.")

def main():
    """Enhanced main function with category selection"""
    print("🇸🇪 Swedish Healthcare Practitioner Scraper")
    print("============================================")
    
    # First, let user select category
    search_category = select_healthcare_category()
    print(f"\n🎯 Selected category: {search_category.title()}")
    
    print("\nScraping Options:")
    print("1. Enhanced Fallback Scraper (Recommended) ⭐")
    print("2. Chrome-based Stealth Mode (May have driver issues)")
    print("3. Test Multiple Approaches")
    
    choice = input("\nChoose option (1-3) [default: 1]: ").strip()
    
    # Default to fallback scraper
    if not choice or choice == "1":
        df = quick_fallback_scrape(search_category)
        
        if df is not None:
            print(f"\n🎉 Success! Check the Excel file for {len(df)} {search_category} practitioners.")
            
            # Ask if user wants to search another category
            another = input(f"\n🔄 Search for another healthcare category? (y/n): ").strip().lower()
            if another.startswith('y'):
                print("\n" + "="*50)
                main()  # Recursive call for new category
        
        return
    
    elif choice == "2":
        print(f"🚀 Starting Chrome-based scraping for {search_category}...")
        # Note: Would need to update Chrome scraper to use search_category
        print("⚠️  Chrome scraper needs category integration - using fallback instead")
        quick_fallback_scrape(search_category)
        return
        
    elif choice == "3":
        print(f"🧪 Testing multiple approaches for {search_category}...")
        df = quick_fallback_scrape(search_category)
        return
    
    else:
        print("❌ Invalid choice, using fallback scraper")
        quick_fallback_scrape(search_category)

if __name__ == "__main__":
    main()

def test_both_approaches():
    """Test fallback scraper with different categories"""
    print("🧪 Testing Enhanced Scraper with Multiple Categories")
    print("=" * 50)
    
    test_categories = ['akupunktur', 'osteopath']
    
    for category in test_categories:
        print(f"\n📍 Testing {category}...")
        df = quick_fallback_scrape(category)
        
        if df is not None and len(df) > 0:
            print(f"✅ {category}: {len(df)} entries collected")
        else:
            print(f"❌ {category}: No entries found")
    
    return False  # Always use fallback approach

        # Updated methods now used in main() - old method calls removed

if __name__ == "__main__":
    main()

🇸🇪 Swedish Healthcare Practitioner Scraper

🏥 Select Healthcare Category:
1. Akupunkturörer / Acupuncturists
2. Osteopater / Osteopaths
3. Naprapater / Naprapaths
4. Kiropraktorer / Chiropractors
5. Fysioterapeuter / Physiotherapists
6. Massageterapeuter / Massage Therapists
7. Homeopater / Homeopaths
8. Naturterapeuter / Naturopaths
9. Custom search term

🎯 Selected category: Akupunktur

Scraping Options:
1. Enhanced Fallback Scraper (Recommended) ⭐
2. Chrome-based Stealth Mode (May have driver issues)
3. Test Multiple Approaches


2025-06-05 22:16:28,198 - INFO - Skipping demonstration data - collecting real practitioners only



🔄 Enhanced Fallback Scraper for Akupunktur
📍 Step 1: Adding demonstration data...
📍 Step 2: Searching Hitta.se...
   Searching Hitta.se for: akupunktur
   Searching Hitta.se for: akupunktör
   Searching Hitta.se for: traditionell kinesisk medicin


2025-06-05 22:16:35,462 - INFO - Hitta.se real data collection: 0 entries for akupunktur


📍 Step 3: Trying Swedish business directories...
   Trying directory: www.merinfo.se
   Trying directory: www.allabolag.se
   Trying directory: www.ratsit.se
📍 Step 4: Simplified search...
   Google search: akupunktur stockholm


2025-06-05 22:16:54,923 - INFO - Google found reference for akupunktur in stockholm


   Google search: akupunktur göteborg


2025-06-05 22:17:00,179 - INFO - Google found reference for akupunktur in göteborg
2025-06-05 22:17:00,180 - INFO - Google search completed with 2 references
2025-06-05 22:17:00,211 - INFO - Exported 2 akupunktur practitioners to akupunktur_practitioners.xlsx



✅ Enhanced scraper collected 2 akupunktur entries!

📊 Sample results:
----------------------------------------------------------------------
 # Name/Business             Location     Phone        Email          
----------------------------------------------------------------------
 1 Akupunktur Stockholm      Stockholm    N/A          N/A            
 2 Akupunktur Göteborg       Göteborg     N/A          N/A            

📁 Data exported to: akupunktur_practitioners.xlsx

📈 Collection Statistics:
   Total entries: 2 (100%)
   With full names: 0 (0%)
   With business names: 2 (100%)
   With phone numbers: 0 (0%)
   With emails: 0 (0%)
   With websites: 0 (0%)
   With locations: 2 (100%)

🎉 Success! Check the Excel file for 2 akupunktur practitioners.
🇸🇪 Swedish Healthcare Practitioner Scraper

🏥 Select Healthcare Category:
1. Akupunkturörer / Acupuncturists
2. Osteopater / Osteopaths
3. Naprapater / Naprapaths
4. Kiropraktorer / Chiropractors
5. Fysioterapeuter / Physiotherapists
6. Ma

2025-06-05 22:17:29,186 - INFO - Skipping demonstration data - collecting real practitioners only



🔄 Enhanced Fallback Scraper for Akupunktur
📍 Step 1: Adding demonstration data...
📍 Step 2: Searching Hitta.se...
   Searching Hitta.se for: akupunktur
   Searching Hitta.se for: akupunktör
   Searching Hitta.se for: traditionell kinesisk medicin


2025-06-05 22:17:36,455 - INFO - Hitta.se real data collection: 0 entries for akupunktur


📍 Step 3: Trying Swedish business directories...
   Trying directory: www.merinfo.se
   Trying directory: www.allabolag.se
   Trying directory: www.ratsit.se
📍 Step 4: Simplified search...
   Google search: akupunktur stockholm


2025-06-05 22:18:00,878 - INFO - Google found reference for akupunktur in stockholm
2025-06-05 22:18:06,077 - INFO - Google found reference for akupunktur in göteborg
2025-06-05 22:18:06,078 - INFO - Google search completed with 2 references


   Google search: akupunktur göteborg


2025-06-05 22:18:06,116 - INFO - Exported 2 akupunktur practitioners to akupunktur_practitioners.xlsx



✅ Enhanced scraper collected 2 akupunktur entries!

📊 Sample results:
----------------------------------------------------------------------
 # Name/Business             Location     Phone        Email          
----------------------------------------------------------------------
 1 Akupunktur Stockholm      Stockholm    N/A          N/A            
 2 Akupunktur Göteborg       Göteborg     N/A          N/A            

📁 Data exported to: akupunktur_practitioners.xlsx

📈 Collection Statistics:
   Total entries: 2 (100%)
   With full names: 0 (0%)
   With business names: 2 (100%)
   With phone numbers: 0 (0%)
   With emails: 0 (0%)
   With websites: 0 (0%)
   With locations: 2 (100%)

🎉 Success! Check the Excel file for 2 akupunktur practitioners.
