In [1]:
#!/usr/bin/env python3
"""
LinkedIn Profile Scraper via Google Search
Automated discovery of CEO/CFO LinkedIn profiles by city
"""

import json
import os
import random
import re
import time
from typing import Dict, List, Set, Optional
from urllib.parse import urljoin, urlparse
import logging
from dataclasses import dataclass, asdict

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException, 
    WebDriverException, 
    TimeoutException,
    ElementClickInterceptedException
)
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

@dataclass
class LinkedInProfile:
    """Data structure for LinkedIn profile information"""
    name: Optional[str]
    url: str
    city: Optional[str]
    company: Optional[str]
    title: Optional[str]
    scraped_at: str

class LinkedInScraper:
    """Main scraper class for LinkedIn profiles via Google Search"""
    
    def __init__(self, position: str = "ceo", city: str = "bangalore"):
        self.position = position.lower()
        self.city = city.lower()
        self.search_query = f"{position} {city} linkedin"
        self.max_pages = 30
        self.checkpoint_file = f"scraped_urls_{position}_{city}.json"
        self.output_file = f"linkedin_{position}_{city}_profiles.json"
        self.scraped_urls: Set[str] = set()
        self.profiles: List[LinkedInProfile] = []
        
        # User agents for rotation
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ]
    
    def get_stealth_driver(self) -> webdriver.Chrome:
        """Create a stealth Chrome driver to avoid detection"""
        options = Options()
        
        # Stealth settings
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-plugins")
        options.add_argument("--disable-images")  # Faster loading
        options.add_argument("--disable-javascript")  # For basic scraping
        options.add_argument(f"--user-agent={random.choice(self.user_agents)}")
        
        # Exclude automation switches
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        
        # Additional prefs
        prefs = {
            "profile.default_content_setting_values": {
                "notifications": 2,
                "media_stream": 2,
            }
        }
        options.add_experimental_option("prefs", prefs)
        
        try:
            driver = webdriver.Chrome(options=options)
            driver.set_page_load_timeout(30)
            driver.implicitly_wait(10)
            
            # Execute script to remove webdriver property
            driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            
            return driver
        except Exception as e:
            logger.error(f"Failed to create driver: {e}")
            raise
    
    def load_checkpoint(self) -> None:
        """Load previously scraped URLs from checkpoint file"""
        if os.path.exists(self.checkpoint_file):
            try:
                with open(self.checkpoint_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    self.scraped_urls = set(data.get("urls", []))
                    logger.info(f"Loaded {len(self.scraped_urls)} previously scraped URLs")
            except Exception as e:
                logger.warning(f"Failed to load checkpoint: {e}")
                self.scraped_urls = set()
    
    def save_checkpoint(self) -> None:
        """Save current progress to checkpoint file"""
        try:
            with open(self.checkpoint_file, "w", encoding="utf-8") as f:
                json.dump({
                    "urls": list(self.scraped_urls),
                    "total_profiles": len(self.profiles),
                    "last_updated": time.strftime("%Y-%m-%d %H:%M:%S")
                }, f, indent=2)
            logger.info(f"Checkpoint saved: {len(self.scraped_urls)} URLs")
        except Exception as e:
            logger.error(f"Failed to save checkpoint: {e}")
    
    def human_delay(self, min_delay: float = 2.0, max_delay: float = 5.0) -> None:
        """Simulate human-like delays"""
        delay = random.uniform(min_delay, max_delay)
        time.sleep(delay)
    
    def extract_linkedin_data(self, link_element, driver) -> Optional[LinkedInProfile]:
        """Extract LinkedIn profile data from search result element"""
        try:
            # Get URL
            url = link_element.get_attribute('href')
            if not url or 'linkedin.com/in/' not in url:
                return None
            
            # Clean URL
            url = url.split('&')[0].split('?')[0]
            
            # Skip if already scraped
            if url in self.scraped_urls:
                return None
            
            # Extract name from title/text
            name = None
            try:
                h3_element = link_element.find_element(By.TAG_NAME, "h3")
                name = h3_element.text.strip()
            except:
                pass
            
            # Extract additional info from description
            city, company, title = None, None, None
            try:
                parent = link_element.find_element(By.XPATH, "./..")
                description_elements = parent.find_elements(By.CSS_SELECTOR, "span, div")
                
                for elem in description_elements:
                    text = elem.text.strip()
                    if text and len(text) > 3:
                        # Try to identify city, company, or title
                        if any(word in text.lower() for word in [self.city, 'ceo', 'cfo', 'president', 'director']):
                            if not title and any(word in text.lower() for word in ['ceo', 'cfo', 'president', 'director']):
                                title = text
                            elif not city and self.city.lower() in text.lower():
                                city = text
                            elif not company and len(text.split()) <= 4:
                                company = text
            except:
                pass
            
            profile = LinkedInProfile(
                name=name,
                url=url,
                city=city or self.city,
                company=company,
                title=title or self.position.upper(),
                scraped_at=time.strftime("%Y-%m-%d %H:%M:%S")
            )
            
            self.scraped_urls.add(url)
            return profile
            
        except Exception as e:
            logger.warning(f"Failed to extract profile data: {e}")
            return None
    
    def scrape_google_page(self, driver) -> List[LinkedInProfile]:
        """Scrape LinkedIn profiles from current Google results page"""
        profiles = []
        
        try:
            # Wait for results to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div#search"))
            )
            
            # Find LinkedIn links
            linkedin_links = driver.find_elements(By.XPATH, "//a[contains(@href, 'linkedin.com/in/')]")
            logger.info(f"Found {len(linkedin_links)} LinkedIn links on page")
            
            for link in linkedin_links:
                try:
                    profile = self.extract_linkedin_data(link, driver)
                    if profile:
                        profiles.append(profile)
                        logger.info(f"Scraped: {profile.name} - {profile.url}")
                except Exception as e:
                    logger.warning(f"Error processing link: {e}")
                    continue
            
        except TimeoutException:
            logger.warning("Timeout waiting for search results")
        except Exception as e:
            logger.error(f"Error scraping page: {e}")
        
        return profiles
    
    def navigate_to_next_page(self, driver) -> bool:
        """Navigate to next Google results page"""
        try:
            # Look for next button
            next_selectors = [
                "a#pnnext",
                "a[aria-label='Next']",
                "a[aria-label*='Next']",
                "//a[contains(text(), 'Next')]"
            ]
            
            for selector in next_selectors:
                try:
                    if selector.startswith("//"):
                        next_button = driver.find_element(By.XPATH, selector)
                    else:
                        next_button = driver.find_element(By.CSS_SELECTOR, selector)
                    
                    if next_button.is_enabled():
                        driver.execute_script("arguments[0].click();", next_button)
                        self.human_delay(3, 6)
                        return True
                except:
                    continue
            
            return False
            
        except Exception as e:
            logger.warning(f"Failed to navigate to next page: {e}")
            return False
    
    def run_scraping(self) -> List[LinkedInProfile]:
        """Main scraping execution"""
        logger.info(f"Starting scrape for: {self.search_query}")
        
        # Load previous progress
        self.load_checkpoint()
        
        driver = None
        try:
            driver = self.get_stealth_driver()
            
            # Go to Google
            driver.get("https://www.google.com")
            self.human_delay()
            
            # Perform search
            search_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, "q"))
            )
            search_box.clear()
            search_box.send_keys(self.search_query)
            search_box.send_keys(Keys.RETURN)
            
            self.human_delay(2, 4)
            
            # Scrape pages
            page_count = 0
            while page_count < self.max_pages:
                logger.info(f"Scraping page {page_count + 1}")
                
                # Scrape current page
                page_profiles = self.scrape_google_page(driver)
                self.profiles.extend(page_profiles)
                
                # Save progress
                if page_count % 5 == 0:  # Save every 5 pages
                    self.save_checkpoint()
                    self.save_results()
                
                # Navigate to next page
                if not self.navigate_to_next_page(driver):
                    logger.info("No more pages available")
                    break
                
                page_count += 1
                
                # Longer delay every few pages to avoid detection
                if page_count % 10 == 0:
                    logger.info("Taking extended break...")
                    self.human_delay(30, 60)
            
        except KeyboardInterrupt:
            logger.info("Scraping interrupted by user")
        except Exception as e:
            logger.error(f"Scraping failed: {e}")
        finally:
            if driver:
                driver.quit()
            
            # Final save
            self.save_checkpoint()
            self.save_results()
        
        return self.profiles
    
    def save_results(self) -> None:
        """Save scraped profiles to JSON file"""
        try:
            profiles_data = [asdict(profile) for profile in self.profiles]
            
            with open(self.output_file, "w", encoding="utf-8") as f:
                json.dump({
                    "search_query": self.search_query,
                    "total_profiles": len(profiles_data),
                    "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "profiles": profiles_data
                }, f, indent=2, ensure_ascii=False)
            
            logger.info(f"Results saved to {self.output_file}: {len(profiles_data)} profiles")
            
        except Exception as e:
            logger.error(f"Failed to save results: {e}")
    
    def print_summary(self) -> None:
        """Print scraping summary"""
        print(f"\n{'='*50}")
        print(f"SCRAPING SUMMARY")
        print(f"{'='*50}")
        print(f"Search Query: {self.search_query}")
        print(f"Total Profiles Found: {len(self.profiles)}")
        print(f"Unique URLs Scraped: {len(self.scraped_urls)}")
        print(f"Output File: {self.output_file}")
        print(f"Checkpoint File: {self.checkpoint_file}")
        
        if self.profiles:
            print(f"\nSample Profiles:")
            for i, profile in enumerate(self.profiles[:5]):
                print(f"{i+1}. {profile.name} - {profile.company} ({profile.city})")
                print(f"   URL: {profile.url}")
        
        print(f"{'='*50}\n")

def main():
    """Main execution function"""
    # Configuration
    POSITION = "ceo"  # Change as needed: ceo, cfo, president, etc.
    CITY = "bangalore"  # Change as needed: bangalore, mumbai, delhi, etc.
    
    # Create and run scraper
    scraper = LinkedInScraper(position=POSITION, city=CITY)
    
    try:
        profiles = scraper.run_scraping()
        scraper.print_summary()
        
        if profiles:
            print(f"✅ Successfully scraped {len(profiles)} LinkedIn profiles!")
            print(f"📁 Results saved to: {scraper.output_file}")
        else:
            print("❌ No profiles found. Check your search parameters or try again later.")
            
    except Exception as e:
        logger.error(f"Scraping failed: {e}")
        print(f"❌ Scraping failed: {e}")

if __name__ == "__main__":
    main()

2025-05-25 11:19:39,203 - INFO - Starting scrape for: ceo bangalore linkedin
2025-05-25 11:19:51,193 - INFO - Scraping page 1
2025-05-25 11:19:51,263 - INFO - Found 11 LinkedIn links on page
2025-05-25 11:20:01,828 - INFO - Scraped: None - https://in.linkedin.com/in/anandsriganesh#:~:text=Anand%20Sri%20Ganesh%20%2D%20Bengaluru%2C%20Karnataka,%2C%20India%20%7C%20Professional%20Profile%20%7C%20LinkedIn
2025-05-25 11:20:12,112 - INFO - Scraped: None - https://in.linkedin.com/in/mohammed-imran-435ba0187#:~:text=Mohammed%20Imran%20%2D%20Ceo%20
2025-05-25 11:20:25,518 - INFO - Scraped: None - https://in.linkedin.com/in/shivku#:~:text=Shivakumar%20Ganesan%20%2D%20Bengaluru%2C%20Karnataka%2C%20India%20%7C%20Professional%20Profile%20%7C%20LinkedIn
2025-05-25 11:20:26,193 - INFO - Scraped: Nitin Gupta - Bengaluru, Karnataka, India - https://in.linkedin.com/in/nitinguptaprofile
2025-05-25 11:20:26,707 - INFO - Scraped: Sharan Hegde - Founder & CEO - 1% Club - https://in.linkedin.com/in/sharanhegd


SCRAPING SUMMARY
Search Query: ceo bangalore linkedin
Total Profiles Found: 99
Unique URLs Scraped: 99
Output File: linkedin_ceo_bangalore_profiles.json
Checkpoint File: scraped_urls_ceo_bangalore.json

Sample Profiles:
1. None - None (bangalore)
   URL: https://in.linkedin.com/in/anandsriganesh#:~:text=Anand%20Sri%20Ganesh%20%2D%20Bengaluru%2C%20Karnataka,%2C%20India%20%7C%20Professional%20Profile%20%7C%20LinkedIn
2. None - None (bangalore)
   URL: https://in.linkedin.com/in/mohammed-imran-435ba0187#:~:text=Mohammed%20Imran%20%2D%20Ceo%20
3. None - None (bangalore)
   URL: https://in.linkedin.com/in/shivku#:~:text=Shivakumar%20Ganesan%20%2D%20Bengaluru%2C%20Karnataka%2C%20India%20%7C%20Professional%20Profile%20%7C%20LinkedIn
4. Nitin Gupta - Bengaluru, Karnataka, India - None (bangalore)
   URL: https://in.linkedin.com/in/nitinguptaprofile
5. Sharan Hegde - Founder & CEO - 1% Club - None (bangalore)
   URL: https://in.linkedin.com/in/sharanhegde95

✅ Successfully scraped 99 LinkedIn 