In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from functools import wraps
import time
import logging
import json
import csv
import sys

# ------------------------ Retry Decorator ------------------------
def retry_request(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        retries = 3
        for i in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                logging.error(f"Attempt {i+1} failed with error: {e}")
                if i < retries - 1:
                    time.sleep(2)  # Wait before retrying
                else:
                    logging.error(f"All {retries} attempts failed.")
                    raise
    return wrapper

# ------------------------ Scraper Base Class ------------------------
class Scraper:
    def __init__(self, url, custom_selector=None):
        self.url = url
        self.data = []
        self.driver = None
        self.custom_selector = custom_selector

    @retry_request
    def fetch_data(self):
        try:
            logging.info(f"Started scraping from {self.url}")
            # Set up Chrome options for headless mode
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0")

            # Initialize WebDriver
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )
            self.driver.get(self.url)  # Open the page

            # Wait for dynamic content to load
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                logging.info("Page body loaded successfully.")
            except Exception as e:
                logging.warning(f"Dynamic content wait failed: {e}")

            soup = BeautifulSoup(self.driver.page_source, 'lxml')
            return self.parse_data(soup)
        except Exception as e:
            logging.error(f"Error fetching data from {self.url}: {e}")
            raise
        finally:
            if self.driver:
                self.driver.quit()  # Ensure browser is closed

    def parse_data(self, soup):
        # This method will be overridden in subclass
        pass

    def save_data(self, filename='scraped_data.json'):
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=4, ensure_ascii=False)
            logging.info(f"Data saved to {filename}")
        except Exception as e:
            logging.error(f"Error saving JSON to {filename}: {e}")

    def to_csv(self, filename='scraped_data.csv'):
        try:
            if not self.data:
                logging.warning("No data to save to CSV.")
                return
            keys = self.data[0].keys()
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(self.data)
            logging.info(f"Data saved to {filename}")
        except Exception as e:
            logging.error(f"Error saving CSV to {filename}: {e}")

    def log_data(self):
        logging.info(f"Scraped Data: {self.data}")
        if not self.data:
            logging.warning("No data was scraped. Check selectors or site structure.")

# ------------------------ News Scraper ------------------------
class NewsScraper(Scraper):
    def parse_data(self, soup):
        self.data = []
        selectors = [
            'article', 'div.article', 'div.news-item', 'div.post', 'div.story',
            'h1', 'h2', 'h3', 'a.headline', 'div.title'
        ]
        if self.custom_selector:
            selectors.insert(0, self.custom_selector)

        for selector in selectors:
            elements = soup.select(selector)
            logging.info(f"Trying selector '{selector}': found {len(elements)} elements.")
            for element in elements:
                text = element.get_text(strip=True)
                if text:
                    self.data.append({'headline': text})
            if self.data:  # Stop if data is found
                break

        logging.info(f"Found {len(self.data)} news articles.")
        return self.data

# ------------------------ Product Scraper ------------------------
class ProductScraper(Scraper):
    def parse_data(self, soup):
        self.data = []
        selectors = [
            'div.product-item', 'div.product', 'div.item', 'li.product',
            'div.product-title', 'h2', 'h3', 'a.product-name'
        ]
        if self.custom_selector:
            selectors.insert(0, self.custom_selector)

        for selector in selectors:
            elements = soup.select(selector)
            logging.info(f"Trying selector '{selector}': found {len(elements)} elements.")
            for element in elements:
                text = element.get_text(strip=True)
                if text:
                    self.data.append({'product_name': text})
            if self.data:  # Stop if data is found
                break

        logging.info(f"Found {len(self.data)} products.")
        return self.data

# ------------------------ Social Scraper ------------------------
class SocialScraper(Scraper):
    def parse_data(self, soup):
        self.data = []
        selectors = [
            'div.post', 'div.tweet', 'div.status', 'article', 'div.comment',
            'p', 'div.content', 'span.text'
        ]
        if self.custom_selector:
            selectors.insert(0, self.custom_selector)

        for selector in selectors:
            elements = soup.select(selector)
            logging.info(f"Trying selector '{selector}': found {len(elements)} elements.")
            for element in elements:
                text = element.get_text(strip=True)
                if text:
                    self.data.append({'post': text})
            if self.data:  # Stop if data is found
                break

        logging.info(f"Found {len(self.data)} social posts.")
        return self.data

# ------------------------ Data Cleaner ------------------------
class DataCleaner:
    @staticmethod
    def clean_text(text):
        if not text:
            return ''
        return ' '.join(text.strip().split())

# ------------------------ Lead Generator ------------------------
class LeadGenerator:
    def __init__(self, scraper):
        self.scraper = scraper

    def generate_leads(self):
        key = list(self.scraper.data[0].keys())[0] if self.scraper.data else 'item'
        leads = [{'lead': item[key]} for item in self.scraper.data]
        logging.info(f"Generated {len(leads)} leads.")
        return leads

# ------------------------ CLI ------------------------
def main():
    # Configure logging
    logging.basicConfig(
        filename='scraper_log.txt',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    print("Web Scraper CLI")
    print("---------------")
    print("Select scraper to run:")
    print("1. News Scraper")
    print("2. Product Scraper")
    print("3. Social Scraper")
    print("4. Exit")
    
    choice = input("Enter your choice (1-4): ").strip()
    
    if choice == '4':
        print("Exiting...")
        sys.exit(0)
    
    if choice not in ['1', '2', '3']:
        print("Invalid choice! Please select 1, 2, 3, or 4.")
        return

    url = input("Enter the URL to scrape (e.g., https://example.com): ").strip()
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    custom_selector = input("Enter a custom CSS selector (e.g., 'div.story h3') or press Enter to use defaults: ").strip() or None

    scraper = None
    if choice == '1':
        scraper = NewsScraper(url, custom_selector)
    elif choice == '2':
        scraper = ProductScraper(url, custom_selector)
    elif choice == '3':
        scraper = SocialScraper(url, custom_selector)

    try:
        scraper.fetch_data()  # Trigger the scraping
        scraper.log_data()    # Log the data for review

        if not scraper.data:
            print("No data was scraped. Try a different URL or custom selector.")
            print("Check 'scraper_log.txt' for details.")
            print("Tip: Inspect the website's HTML (F12 in browser) to find the correct CSS selector.")
            return

        # Clean the data
        cleaner = DataCleaner()
        key = list(scraper.data[0].keys())[0] if scraper.data else 'item'
        cleaned_data = [cleaner.clean_text(item[key]) for item in scraper.data]
        print("Cleaned Data:", cleaned_data)

        # Generate leads from the data
        lead_gen = LeadGenerator(scraper)
        leads = lead_gen.generate_leads()
        print("Generated Leads:", leads)

        # Save data to JSON and CSV
        scraper.save_data()
        scraper.to_csv()

    except Exception as e:
        print(f"An error occurred: {e}")
        logging.error(f"Main execution failed: {e}")
        print("Check 'scraper_log.txt' for details.")

if __name__ == "__main__":
    main()

Web Scraper CLI
---------------
Select scraper to run:
1. News Scraper
2. Product Scraper
3. Social Scraper
4. Exit
Cleaned Data: ['AdvertisementNewsNewsLIVEAustralian opposition party reeling after Albanese\'s landslide election winAnthony Albanese\'s left-leaning Labor Party has won an outright majority in parliament, according to the latest countingAustralia PM Albanese makes stunning comeback with landslide winDonald Trump loomed large over the election, with comparisons drawn between the US president and defeated opposition leader Peter Dutton.7 hrs agoAustralia\'Trump was the game-changer\': Three things to know about the Australian election resultThe BBC\'s Australia correspondent Katy Watson breaks down who won and lost the election - and what it means.14 hrs agoAustraliaHe wanted to be Australia\'s PM. But a \'Trump effect\' thwarted Peter DuttonA Trump effect and a lack of a coherent campaign saw opposition leader Peter Dutton lose the election.15 hrs agoAustraliaA look at ho