In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin, urlparse
import json
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Set
import logging
import hashlib
import random
# Removed fake_useragent due to Python 3.8 compatibility issue

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class AcupuncturistData:
    """Data structure for acupuncturist information"""
    full_name: str = ""
    business_name: str = ""
    website: str = ""
    email: str = ""
    phone: str = ""
    location: str = ""
    source_url: str = ""
    
    def to_dict(self) -> Dict:
        return asdict(self)
    
    def get_hash(self) -> str:
        """Generate unique hash for deduplication"""
        # Normalize data for comparison
        name = self.full_name.lower().strip()
        business = self.business_name.lower().strip()
        phone = re.sub(r'[^\d]', '', self.phone)
        email = self.email.lower().strip()
        
        # Create hash from key fields
        key = f"{name}|{business}|{phone}|{email}"
        return hashlib.md5(key.encode()).hexdigest()

class BaseScraper:
    """Base class for all scrapers - requests only version"""
    
    def __init__(self):
        self.session = requests.Session()
        
        # Static list of user agents (no need for fake_useragent library)
        user_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        user_agent = random.choice(user_agents)
        
        self.session.headers.update({
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5,sv;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        })
    
    def get_page(self, url: str, wait_time: int = None) -> BeautifulSoup:
        """Get page content with enhanced anti-detection measures"""
        if wait_time is None:
            wait_time = random.uniform(2, 5)  # Longer delays to appear human
        time.sleep(wait_time)
        
        # Add cookies and session handling
        if not hasattr(self, '_session_initialized'):
            # Visit homepage first to get session cookies
            try:
                domain = urlparse(url).netloc
                homepage = f"https://{domain}"
                self.session.get(homepage, timeout=10)
                self._session_initialized = True
            except:
                pass
        
        # Enhanced headers to appear more legitimate
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'sv-SE,sv;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }
        
        try:
            response = self.session.get(url, timeout=15, headers=headers, allow_redirects=True)
            
            # Handle different status codes
            if response.status_code == 403:
                logger.warning(f"403 Forbidden for {url} - trying alternative approach")
                return BeautifulSoup("", 'html.parser')
            elif response.status_code == 429:
                logger.warning(f"Rate limited for {url} - waiting longer")
                time.sleep(10)
                return BeautifulSoup("", 'html.parser')
            
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
            
        except Exception as e:
            logger.error(f"Error fetching {url}: {e}")
            return BeautifulSoup("", 'html.parser')
    
    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        if not text:
            return ""
        return re.sub(r'\s+', ' ', text.strip())
    
    def extract_email(self, text: str) -> str:
        """Extract email from text"""
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, text)
        return emails[0] if emails else ""
    
    def extract_phone(self, text: str) -> str:
        """Extract Swedish phone number from text"""
        # Swedish phone patterns
        patterns = [
            r'\b(?:\+46|0)(?:\s*-?\s*)?(?:\d{1,4}(?:\s*-?\s*)?){2,4}\b',
            r'\b\d{2,4}[-\s]\d{2,4}[-\s]\d{2,4}\b',
            r'\b\d{10,11}\b'
        ]
        
        for pattern in patterns:
            phones = re.findall(pattern, text)
            if phones:
                return phones[0].strip()
        return ""
    
    def scrape(self) -> List[AcupuncturistData]:
        """Override this method in each scraper"""
        raise NotImplementedError

class HittaScraper(BaseScraper):
    """Scraper for Hitta.se (Swedish business directory)"""
    
    def __init__(self):
        super().__init__()
        self.base_url = "https://www.hitta.se"
    
    def scrape(self) -> List[AcupuncturistData]:
        results = []
        
        # Different search terms for comprehensive coverage
        search_terms = [
            "akupunktur",
            "akupunktör",
            "traditionell kinesisk medicin",
            "TCM"
        ]
        
        for term in search_terms:
            try:
                search_url = f"{self.base_url}/sök/{term}"
                soup = self.get_page(search_url)
                
                # Find business listings
                businesses = soup.find_all(['div', 'article'], class_=re.compile(r'hit|result|listing'))
                
                for business in businesses:
                    data = AcupuncturistData()
                    data.source_url = search_url
                    
                    # Extract business name
                    name_selectors = ['h2', 'h3', '.company-name', '.business-name', '.name']
                    for selector in name_selectors:
                        name_elem = business.find(selector)
                        if name_elem:
                            data.business_name = self.clean_text(name_elem.get_text())
                            break
                    
                    # Extract location/address
                    address_selectors = ['.address', '.location', '.locality']
                    for selector in address_selectors:
                        addr_elem = business.find(selector)
                        if addr_elem:
                            data.location = self.clean_text(addr_elem.get_text())
                            break
                    
                    # Extract contact info from all text
                    all_text = business.get_text()
                    data.email = self.extract_email(all_text)
                    data.phone = self.extract_phone(all_text)
                    
                    # Extract website
                    links = business.find_all('a', href=True)
                    for link in links:
                        href = link['href']
                        if any(domain in href for domain in ['.se', '.com', '.net', '.org']) and 'hitta.se' not in href:
                            data.website = href
                            break
                    
                    if data.business_name and ('akupunktur' in data.business_name.lower() or 'tcm' in data.business_name.lower()):
                        results.append(data)
                        
            except Exception as e:
                logger.error(f"Error scraping Hitta for {term}: {e}")
        
        return results

class YellowPagesScraper(BaseScraper):

    class EniroScraper(BaseScraper):
        """Scraper for Eniro.se (Swedish business directory)"""
        
        def __init__(self):
            super().__init__()
            self.base_url = "https://www.eniro.se"
        
        def scrape(self) -> List[AcupuncturistData]:
            results = []
            
            search_terms = ["akupunktur", "akupunktör", "traditionell kinesisk medicin"]
            
            for term in search_terms:
                try:
                    search_url = f"{self.base_url}/sok/foretagsuppgifter?q={term}"
                    soup = self.get_page(search_url)
                    
                    # Find business listings
                    businesses = soup.find_all(['div', 'article'], class_=re.compile(r'company|hit|result'))
                    
                    for business in businesses:
                        data = AcupuncturistData()
                        data.source_url = search_url
                        
                        # Extract business name
                        name_elem = business.find(['h2', 'h3', '.company-name', '.name'])
                        if name_elem:
                            data.business_name = self.clean_text(name_elem.get_text())
                        
                        # Extract location
                        address_elem = business.find(['.address', '.location'])
                        if address_elem:
                            data.location = self.clean_text(address_elem.get_text())
                        
                        # Extract phone
                        phone_elem = business.find(['.phone', '.telephone'])
                        if phone_elem:
                            data.phone = self.clean_text(phone_elem.get_text())
                        
                        # Extract email and website from links
                        links = business.find_all('a', href=True)
                        for link in links:
                            href = link['href']
                            if 'mailto:' in href:
                                data.email = href.replace('mailto:', '')
                            elif any(domain in href for domain in ['.se', '.com', '.net']) and 'eniro.se' not in href:
                                data.website = href
                        
                        if data.business_name:
                            results.append(data)
                            
                except Exception as e:
                    logger.error(f"Error scraping Eniro for {term}: {e}")
            
            return results

class RatsitScraper(BaseScraper):
    """Scraper for Ratsit.se"""
    
    def __init__(self):
        super().__init__()
        self.base_url = "https://www.ratsit.se"
    
    def scrape(self) -> List[AcupuncturistData]:
        results = []
        
        try:
            search_url = f"{self.base_url}/sök?q=akupunktur"
            soup = self.get_page(search_url)
            
            # Find business/person listings
            listings = soup.find_all(['div', 'article'], class_=re.compile(r'result|hit|listing'))
            
            for listing in listings:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract name (could be person or business)
                name_elem = listing.find(['h2', 'h3', '.name', '.title'])
                if name_elem:
                    name_text = self.clean_text(name_elem.get_text())
                    # Determine if it's a person or business name
                    if any(word in name_text.lower() for word in ['ab', 'akupunktur', 'klinik', 'center']):
                        data.business_name = name_text
                    else:
                        data.full_name = name_text
                
                # Extract location
                location_elem = listing.find(['.address', '.location', '.area'])
                if location_elem:
                    data.location = self.clean_text(location_elem.get_text())
                
                # Extract contact info
                all_text = listing.get_text()
                data.email = self.extract_email(all_text)
                data.phone = self.extract_phone(all_text)
                
                if data.full_name or data.business_name:
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping Ratsit: {e}")
        
        return results

class HealthProfessionalsScraper(BaseScraper):
    """Scraper for healthcare professional directories"""
    
    def __init__(self):
        super().__init__()
    
    def scrape_bokadirekt(self) -> List[AcupuncturistData]:
        """Scrape BokaDirekt.se for acupuncturists"""
        results = []
        
        try:
            search_url = "https://www.bokadirekt.se/boka/akupunktur"
            soup = self.get_page(search_url)
            
            practitioners = soup.find_all(['div', 'article'], class_=re.compile(r'practitioner|provider|therapist'))
            
            for practitioner in practitioners:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract practitioner name
                name_elem = practitioner.find(['h2', 'h3', '.name', '.practitioner-name'])
                if name_elem:
                    data.full_name = self.clean_text(name_elem.get_text())
                
                # Extract clinic/business name
                clinic_elem = practitioner.find(['.clinic', '.business', '.practice'])
                if clinic_elem:
                    data.business_name = self.clean_text(clinic_elem.get_text())
                
                # Extract location
                location_elem = practitioner.find(['.location', '.address', '.area'])
                if location_elem:
                    data.location = self.clean_text(location_elem.get_text())
                
                # Extract contact info
                all_text = practitioner.get_text()
                data.email = self.extract_email(all_text)
                data.phone = self.extract_phone(all_text)
                
                # Extract website
                links = practitioner.find_all('a', href=True)
                for link in links:
                    href = link['href']
                    if any(domain in href for domain in ['.se', '.com']) and 'bokadirekt.se' not in href:
                        data.website = href
                        break
                
                if data.full_name or data.business_name:
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping BokaDirekt: {e}")
        
        return results
    
    def scrape_vardguiden(self) -> List[AcupuncturistData]:
        """Scrape 1177.se/Vårdguiden for acupuncturists"""
        results = []
        
        try:
            # 1177.se often has healthcare provider listings
            search_url = "https://www.1177.se/hitta-vard/?q=akupunktur"
            soup = self.get_page(search_url)
            
            providers = soup.find_all(['div', 'article'], class_=re.compile(r'provider|clinic|unit'))
            
            for provider in providers:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract clinic name
                name_elem = provider.find(['h2', 'h3', '.name', '.unit-name'])
                if name_elem:
                    data.business_name = self.clean_text(name_elem.get_text())
                
                # Extract location
                address_elem = provider.find(['.address', '.location'])
                if address_elem:
                    data.location = self.clean_text(address_elem.get_text())
                
                # Extract contact info
                contact_elem = provider.find(['.contact', '.phone'])
                if contact_elem:
                    contact_text = contact_elem.get_text()
                    data.phone = self.extract_phone(contact_text)
                    data.email = self.extract_email(contact_text)
                
                if data.business_name and 'akupunktur' in data.business_name.lower():
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping Vårdguiden: {e}")
        
        return results
    
    def scrape(self) -> List[AcupuncturistData]:
        """Run all healthcare directory scrapers"""
        results = []
        results.extend(self.scrape_bokadirekt())
        results.extend(self.scrape_vardguiden())
        return results

class GoogleSearchScraper(BaseScraper):
    """Scraper for Google search results - more accessible"""
    
    def __init__(self):
        super().__init__()
    
    def scrape(self) -> List[AcupuncturistData]:
        results = []
        
        # Search terms for Swedish acupuncturists
        search_queries = [
            "akupunktur stockholm site:se",
            "akupunktör göteborg site:se", 
            "traditionell kinesisk medicin malmö site:se",
            "akupunktur uppsala site:se"
        ]
        
        for query in search_queries:
            try:
                # Use Google search (be careful not to abuse)
                search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}&num=20"
                soup = self.get_page(search_url, wait_time=random.uniform(3, 6))
                
                # Find search result links
                results_divs = soup.find_all('div', class_='g')
                
                for result_div in results_divs:
                    try:
                        # Extract link
                        link_elem = result_div.find('a', href=True)
                        if not link_elem:
                            continue
                            
                        website = link_elem['href']
                        if not website.startswith('http'):
                            continue
                        
                        # Extract title (potential business name)
                        title_elem = result_div.find('h3')
                        if not title_elem:
                            continue
                            
                        title = self.clean_text(title_elem.get_text())
                        
                        # Extract snippet for contact info
                        snippet_elem = result_div.find(['span', 'div'], class_=re.compile(r'st|s3v9rd'))
                        snippet_text = snippet_elem.get_text() if snippet_elem else ""
                        
                        data = AcupuncturistData()
                        data.source_url = search_url
                        data.business_name = title
                        data.website = website
                        data.email = self.extract_email(snippet_text)
                        data.phone = self.extract_phone(snippet_text)
                        
                        # Try to extract location from snippet
                        if any(city in snippet_text.lower() for city in ['stockholm', 'göteborg', 'malmö', 'uppsala']):
                            data.location = snippet_text[:100]  # First part likely contains location
                        
                        if data.business_name and 'akupunktur' in data.business_name.lower():
                            results.append(data)
                            
                    except Exception as e:
                        logger.debug(f"Error processing Google result: {e}")
                        continue
                        
            except Exception as e:
                logger.error(f"Error scraping Google for {query}: {e}")
        
        return results

class PublicDirectoryScraper(BaseScraper):
    """Scraper for more open/public directories"""
    
    def __init__(self):
        super().__init__()
    
    def scrape_merinfo(self) -> List[AcupuncturistData]:
        """Scrape Merinfo.se - often more accessible"""
        results = []
        
        try:
            search_url = "https://www.merinfo.se/search?q=akupunktur"
            soup = self.get_page(search_url)
            
            companies = soup.find_all(['div', 'article'], class_=re.compile(r'company|result'))
            
            for company in companies:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract company name
                name_elem = company.find(['h2', 'h3', '.company-name'])
                if name_elem:
                    data.business_name = self.clean_text(name_elem.get_text())
                
                # Extract location
                location_elem = company.find(['.address', '.location'])
                if location_elem:
                    data.location = self.clean_text(location_elem.get_text())
                
                # Extract all text for contact info
                all_text = company.get_text()
                data.email = self.extract_email(all_text)
                data.phone = self.extract_phone(all_text)
                
                if data.business_name:
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping Merinfo: {e}")
        
        return results
    
    def scrape_allabolag(self) -> List[AcupuncturistData]:
        """Scrape Allabolag.se for company information"""
        results = []
        
        try:
            search_url = "https://www.allabolag.se/what/akupunktur"
            soup = self.get_page(search_url)
            
            companies = soup.find_all(['div'], class_=re.compile(r'company|hit'))
            
            for company in companies:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract company name
                name_elem = company.find(['a', 'h3'], class_=re.compile(r'company|name'))
                if name_elem:
                    data.business_name = self.clean_text(name_elem.get_text())
                
                # Extract location from any address elements
                address_text = company.get_text()
                # Look for Swedish postal codes and cities
                postal_match = re.search(r'\d{3}\s?\d{2}\s+([A-ZÅÄÖ][a-zåäö]+)', address_text)
                if postal_match:
                    data.location = postal_match.group(0)
                
                if data.business_name and 'akupunktur' in data.business_name.lower():
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping Allabolag: {e}")
        
        return results
    
    def scrape(self) -> List[AcupuncturistData]:
        """Run all public directory scrapers"""
        results = []
        results.extend(self.scrape_merinfo())
        results.extend(self.scrape_allabolag())
        return results
    """Scraper for GulaS.se (Swedish Yellow Pages)"""
    
    def __init__(self):
        super().__init__()
        self.base_url = "https://www.gulasidorna.se"
    
    def scrape(self) -> List[AcupuncturistData]:
        results = []
        
        try:
            search_url = f"{self.base_url}/sok?q=akupunktur"
            soup = self.get_page(search_url)
            
            # Find business listings
            businesses = soup.find_all(['div', 'article'], class_=re.compile(r'company|business|listing'))
            
            for business in businesses:
                data = AcupuncturistData()
                data.source_url = search_url
                
                # Extract business name
                name_elem = business.find(['h2', 'h3', '.company-name'])
                if name_elem:
                    data.business_name = self.clean_text(name_elem.get_text())
                
                # Extract location
                address_elem = business.find(['.address', '.location'])
                if address_elem:
                    data.location = self.clean_text(address_elem.get_text())
                
                # Extract contact details
                all_text = business.get_text()
                data.email = self.extract_email(all_text)
                data.phone = self.extract_phone(all_text)
                
                # Extract website
                links = business.find_all('a', href=True)
                for link in links:
                    href = link['href']
                    if 'http' in href and 'gulasidorna.se' not in href:
                        data.website = href
                        break
                
                if data.business_name:
                    results.append(data)
                    
        except Exception as e:
            logger.error(f"Error scraping Yellow Pages: {e}")
        
        return results

class AcupuncturistDataCollector:
    """Main class to coordinate all scrapers and manage data"""
    
    def __init__(self):
        self.scrapers = [
            PublicDirectoryScraper(),  # More accessible
            GoogleSearchScraper(),     # Alternative approach
            HittaScraper(),
            # EniroScraper(),          # Often blocked, disabled for now
            RatsitScraper(),
            HealthProfessionalsScraper(),
            YellowPagesScraper(),
        ]
        self.all_data: List[AcupuncturistData] = []
        self.unique_hashes: Set[str] = set()
    
    def run_all_scrapers(self):
        """Run all scrapers and collect data"""
        logger.info("Starting data collection from all sources...")
        
        for scraper in self.scrapers:
            scraper_name = scraper.__class__.__name__
            logger.info(f"Running {scraper_name}...")
            
            try:
                data = scraper.scrape()
                logger.info(f"{scraper_name} collected {len(data)} entries")
                self.all_data.extend(data)
                
                # Add delay between scrapers
                time.sleep(2)
                
            except Exception as e:
                logger.error(f"Error in {scraper_name}: {e}")
    
    def deduplicate_data(self) -> List[AcupuncturistData]:
        """Remove duplicate entries based on hash"""
        unique_data = []
        
        for entry in self.all_data:
            entry_hash = entry.get_hash()
            if entry_hash not in self.unique_hashes:
                self.unique_hashes.add(entry_hash)
                unique_data.append(entry)
        
        logger.info(f"Removed {len(self.all_data) - len(unique_data)} duplicates")
        return unique_data
    
    def clean_and_validate_data(self, data: List[AcupuncturistData]) -> List[AcupuncturistData]:
        """Clean and validate collected data"""
        cleaned_data = []
        
        for entry in data:
            # Skip entries without essential information
            if not entry.full_name and not entry.business_name:
                continue
            
            # Clean phone numbers
            if entry.phone:
                entry.phone = re.sub(r'[^\d\+\-\s]', '', entry.phone)
                # Ensure Swedish phone format
                if entry.phone and not entry.phone.startswith(('+46', '0')):
                    entry.phone = f"0{entry.phone}" if len(entry.phone) >= 9 else entry.phone
            
            # Validate email
            if entry.email and '@' not in entry.email:
                entry.email = ""
            
            # Clean website URLs
            if entry.website:
                if not entry.website.startswith('http'):
                    entry.website = f"https://{entry.website}"
                # Remove tracking parameters
                entry.website = re.sub(r'\?.*$', '', entry.website)
            
            # Clean location
            if entry.location:
                entry.location = re.sub(r'\s+', ' ', entry.location)
            
            cleaned_data.append(entry)
        
        return cleaned_data
    
    def export_to_excel(self, filename: str = "acupuncturists_sweden.xlsx"):
        """Export data to Excel file"""
        unique_data = self.deduplicate_data()
        cleaned_data = self.clean_and_validate_data(unique_data)
        
        # Convert to DataFrame
        df = pd.DataFrame([entry.to_dict() for entry in cleaned_data])
        
        # Reorder columns
        column_order = ['full_name', 'business_name', 'website', 'email', 'phone', 'location', 'source_url']
        df = df.reindex(columns=column_order)
        
        # Export to Excel with formatting
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='Acupuncturists')
            
            # Auto-adjust column widths
            worksheet = writer.sheets['Acupuncturists']
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except:
                        pass
                adjusted_width = min(max_length + 2, 50)
                worksheet.column_dimensions[column_letter].width = adjusted_width
        
        logger.info(f"Exported {len(df)} unique entries to {filename}")
        return df
    
    def export_to_csv(self, filename: str = "acupuncturists_sweden.csv"):
        """Export data to CSV file"""
        unique_data = self.deduplicate_data()
        cleaned_data = self.clean_and_validate_data(unique_data)
        
        # Convert to DataFrame
        df = pd.DataFrame([entry.to_dict() for entry in cleaned_data])
        
        # Reorder columns
        column_order = ['full_name', 'business_name', 'website', 'email', 'phone', 'location', 'source_url']
        df = df.reindex(columns=column_order)
        
        # Export to CSV
        df.to_csv(filename, index=False, encoding='utf-8')
        logger.info(f"Exported {len(df)} unique entries to {filename}")
        
        return df
    
    def get_statistics(self):
        """Get collection statistics"""
        unique_data = self.deduplicate_data()
        
        stats = {
            'total_collected': len(self.all_data),
            'unique_entries': len(unique_data),
            'duplicates_removed': len(self.all_data) - len(unique_data),
            'entries_with_email': len([d for d in unique_data if d.email]),
            'entries_with_phone': len([d for d in unique_data if d.phone]),
            'entries_with_website': len([d for d in unique_data if d.website]),
            'business_names': len([d for d in unique_data if d.business_name]),
            'full_names': len([d for d in unique_data if d.full_name])
        }
        
        return stats

# Simple usage example
def quick_scrape():
    """Quick scraping function for testing"""
    collector = AcupuncturistDataCollector()
    
    # Run just one scraper for testing (use more accessible one)
    public_scraper = PublicDirectoryScraper()
    data = public_scraper.scrape()
    collector.all_data = data
    
    # If no data, try Google search as backup
    if len(data) == 0:
        logger.info("No data from public directories, trying Google search...")
        google_scraper = GoogleSearchScraper()
        backup_data = google_scraper.scrape()
        collector.all_data = backup_data
    
    # Export results
    df = collector.export_to_csv("test_acupuncturists.csv")
    stats = collector.get_statistics()
    
    print("Quick scrape results:")
    for key, value in stats.items():
        print(f"{key}: {value}")
    
    return df

# Main execution
if __name__ == "__main__":
    print("Swedish Acupuncturist Data Scraper")
    print("==================================")
    
    # Ask user for preference
    choice = input("Run full scraping (f) or quick test (q)? [f/q]: ").lower()
    
    if choice == 'q':
        df = quick_scrape()
    else:
        # Create collector and run all scrapers
        collector = AcupuncturistDataCollector()
        
        # Run collection
        collector.run_all_scrapers()
        
        # Export results
        df = collector.export_to_excel()
        collector.export_to_csv()
        
        # Show statistics
        stats = collector.get_statistics()
        
        print(f"\nCollection Summary:")
        print(f"==================")
        for key, value in stats.items():
            print(f"{key.replace('_', ' ').title()}: {value}")
        
        # Display sample data
        if len(df) > 0:
            print("\nSample data (first 3 rows):")
            print("===========================")
            pd.set_option('display.max_columns', None)
            pd.set_option('display.width', None)
            print(df.head(3).to_string(index=False))
        else:
            print("\nNo data collected. Please check the scrapers and try again.")

Swedish Acupuncturist Data Scraper


2025-06-05 17:23:58,472 - INFO - Starting data collection from all sources...
2025-06-05 17:23:58,473 - INFO - Running PublicDirectoryScraper...
2025-06-05 17:24:10,577 - ERROR - Error fetching https://www.gulasidorna.se/sok?q=akupunktur: Exceeded 30 redirects.
2025-06-05 17:24:10,578 - INFO - PublicDirectoryScraper collected 0 entries
2025-06-05 17:24:12,582 - INFO - Running GoogleSearchScraper...
2025-06-05 17:24:30,784 - INFO - GoogleSearchScraper collected 0 entries
2025-06-05 17:24:32,789 - INFO - Running HittaScraper...
2025-06-05 17:24:35,512 - ERROR - Error fetching https://www.hitta.se/sök/akupunktur: 404 Client Error: Not Found for url: https://www.hitta.se/s%C3%B6k/akupunktur
2025-06-05 17:24:38,393 - ERROR - Error fetching https://www.hitta.se/sök/akupunktör: 404 Client Error: Not Found for url: https://www.hitta.se/s%C3%B6k/akupunkt%C3%B6r
2025-06-05 17:24:42,313 - ERROR - Error fetching https://www.hitta.se/sök/traditionell kinesisk medicin: 404 Client Error: Not Found fo


Collection Summary:
Total Collected: 0
Unique Entries: 0
Duplicates Removed: 0
Entries With Email: 0
Entries With Phone: 0
Entries With Website: 0
Business Names: 0
Full Names: 0

No data collected. Please check the scrapers and try again.
