In [None]:
import time
import pandas as pd

from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.selenium_manager import SeleniumManager
from selenium.common.exceptions import *

# Chrome Driver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Firefox Driver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options


class InfiniteScrollScraper:
    """
    Scrape listing records from a page that loads more content as it is scrolled.

    A headless Chrome session is started when the object is created. Call
    ``run()`` to scroll the page, collect the records, write them to CSV and
    shut the browser down again.

    NOTE(review): ``ITEM_SELECTOR`` and ``FIELD_SELECTORS`` still hold template
    placeholders; replace them with real CSS selectors for the target site
    (they can be overridden on a subclass or on an instance).
    """

    # CSS selector that matches one listing on the page.
    ITEM_SELECTOR = "selector_for_items"
    # Output column -> CSS selector, evaluated relative to a single listing.
    FIELD_SELECTORS = {
        "Name": "selector_for_name",
        "Address": "selector_for_address",
        "Email": "selector_for_email",
        "Phone": "selector_for_phone",
        "Website": "selector_for_website",
    }

    def __init__(self, site: str, xpath_selector, scroll_pause: float = 3.0):
        """
        :param site: URL of the page to scrape.
        :param xpath_selector: class name used to build ``class_xpath``; surrounding quotes are optional.
        :param scroll_pause: seconds to wait after each scroll so new content can load.
        """
        self.url = site
        self.data = []
        self.scroll_pause = scroll_pause
        # Building the XPath from a parameter makes it easy to plug in new class names.
        # The value must be a quoted XPath string literal, so quote it here (and drop
        # any quotes the caller already added to avoid doubling them).
        class_name = str(xpath_selector).strip().strip("'\"")
        self.class_xpath = f"//div[@class='{class_name}']"
        self.driver = self.setup_chrome_driver()

    @staticmethod
    def setup_chrome_driver():
        """Create and return a headless Chrome WebDriver."""
        # Imported locally under explicit aliases: the notebook imports both the Chrome
        # and the Firefox `Service`/`Options` under the same names, so the module-level
        # names end up bound to the Firefox classes.
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.webdriver.chrome.service import Service as ChromeService

        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")  # Run in background
        return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    def scroll_and_scrape(self):
        """Open ``self.url`` and scrape it, scrolling to the bottom until the page stops growing."""
        self.driver.get(self.url)
        # Scrape what is already there; otherwise a page that never grows yields nothing.
        self.scrape_data()
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to the bottom and give the page time to load more items.
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(self.scroll_pause)

            # An unchanged height means nothing new was loaded: we are done.
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

            self.scrape_data()

    def scrape_data(self):
        """Append every listing currently on the page to ``self.data``, skipping records already collected."""
        # Each pass sees all listings again, so remember what has been stored already.
        seen = {tuple(record.items()) for record in self.data}
        for item in self.driver.find_elements(By.CSS_SELECTOR, self.ITEM_SELECTOR):
            record = {
                field: self._first_text(item, css_selector)
                for field, css_selector in self.FIELD_SELECTORS.items()
            }
            key = tuple(record.items())
            if key in seen:
                continue
            seen.add(key)
            self.data.append(record)

    @staticmethod
    def _first_text(item, css_selector):
        """Return the text of the first match under ``item``, or "" when the field is absent."""
        # find_elements returns an empty list instead of raising, so one listing
        # without e.g. an email does not abort the whole scrape.
        matches = item.find_elements(By.CSS_SELECTOR, css_selector)
        return matches[0].text if matches else ""

    def save_data(self, path="scraped_data.csv"):
        """Write the collected records to ``path`` as CSV (the header row is written even when empty)."""
        df = pd.DataFrame(self.data, columns=list(self.FIELD_SELECTORS))
        df.to_csv(path, index=False)

    def close_driver(self):
        """Shut down the browser session."""
        self.driver.quit()

    def run(self):
        """Scroll, scrape and save, always closing the browser afterwards."""
        # The driver is created in __init__, so there is nothing to set up here.
        try:
            self.scroll_and_scrape()
            self.save_data()
        finally:
            self.close_driver()


# Search-results page to scrape.
url = "https://www.google.com/maps/search/businesses+in+baton+rouge/"
# The scraper requires the class name of the elements to target; omitting it raised a TypeError.
# NOTE(review): placeholder value -- replace with the real class name used by the result cards on the page.
item_class = "selector_for_items"
scraper = InfiniteScrollScraper(url, xpath_selector=item_class)
scraper.run()

In [15]:
# Data Structures and cleaners
import pandas as pd
import pandas_geojson as pdgeo
import pandas_datareader as pdr
import janitor as jn

# Data Storage and cleaning
from pymongo import MongoClient
import pymongoarrow as arrow

# web driver managers
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager

# Web Automation packages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.selenium_manager import SeleniumManager
from selenium.common.exceptions import *
from dotenv import load_dotenv

from utils import xpath_examples, element_attribute_examples

In [7]:
# Read the project's .env file into the process environment so secrets can stay out of the notebook.
load_dotenv()

True

#### Create an XPath Class:

> Doing this will make it so that you can instantiate multiple objects if you plan on using the scraper to extract more than one item from the site. 

Instead of inheriting them, though, consider taking a single parameter called `xpath` that accepts a list, so that even a one-item list still works. Also make the program check that an XPath was actually entered, and exit if not.

> List of all the possible elements: `[
    "!DOCTYPE", "html", "head", "title", "base", "link", "meta", "style", "script", "noscript",
    "body", "section", "nav", "article", "aside", "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "footer", "address", "main", "p", "hr", "pre", "blockquote", "ol", "ul",
    "li", "dl", "dt", "dd", "figure", "figcaption", "div", "a", "em", "strong", "small",
    "s", "cite", "q", "dfn", "abbr", "ruby", "rt", "rp", "data", "time", "code", "var",
    "samp", "kbd", "sub", "sup", "i", "b", "u", "mark", "bdi", "bdo", "span", "br", "wbr",
    "ins", "del", "image", "img", "iframe", "embed", "object", "param", "video", "audio",
    "source", "track", "canvas", "map", "area", "svg", "math", "table", "caption",
    "colgroup", "col", "tbody", "thead", "tfoot", "tr", "td", "th", "form", "fieldset",
    "legend", "label", "input", "button", "select", "datalist", "optgroup", "option",
    "textarea", "keygen", "output", "progress", "meter", "details", "summary", "menu",
    "menuitem", "dialog"
]
`

> List of possible attributes: `html_attributes = [
    "id", "class", "style", "title", "alt", "src", "href", "target", "rel",
    "type", "value", "name", "placeholder", "disabled", "checked", "readonly",
    "selected", "multiple", "required", "pattern", "min", "max", "step",
    "data-*", "aria-*", "role", "async", "defer", "srcset", "sizes", "hreflang",
    "charset", "autofocus", "autocomplete", "novalidate", "method", "action",
    "enctype", "formmethod", "formaction", "headers", "for", "form", "width",
    "height", "frameborder", "allow", "allowfullscreen", "autoplay", "loop",
    "muted", "controls", "download", "accesskey", "contenteditable", "dir",
    "draggable", "hidden", "lang", "spellcheck", "tabindex", "translate",
    "reversed", "start", "colspan", "rowspan", "headers", "scope", "align",
    "nowrap", "border", "cellpadding", "cellspacing", "summary", "usemap",
    "shape", "coords", "poster", "preload", "kind", "srclang", "sandbox",
    "integrity", "crossorigin", "referrerpolicy", "loading"
]`

In [ ]:
class XpathBuilder:

    """
    Build a simple XPath expression, prompting on stdin for any part that was not supplied.

    The finished expression is stored in ``self.xpath`` and is also what ``str(builder)`` returns.

    Planned workflow (not implemented in this class yet):
    use ``xpath_examples`` to let the user pick what they want to do and
    ``element_attribute_examples`` to validate what they want to target:

    - display the examples
    - have the user select a number 1 - 10 to choose the example
    - ask the user for the element they need to extract
    - ask whether they need to extract an attribute
    - ask whether they need to extract a selector
    - add the error handling needed to make sure the class is used properly

    Once the class is instantiated there should be an XPath stored and ready to be
    passed to the scraper object.
    """

    def __init__(self, element=None, attribute=None, selector=None):
        """
        :param element: tag name to select (e.g. ``div``); prompted for when omitted.
        :param attribute: attribute to test (e.g. ``class``); optional.
        :param selector: value the attribute must equal (or, for ``a``, the link text); optional.

        The attribute/selector prompts are shown only when neither was supplied, so a
        caller-provided attribute is never discarded.
        """
        # Ask for the element exactly once; the answer is validated as non-empty.
        self.element = element or self.ask_for_input("Element (e.g., div, span, a): ")
        if not attribute and not selector:
            self.attribute, self.selector = self.handle_element_specific_logic(self.element)
        else:
            self.attribute, self.selector = attribute, selector
        self.xpath = self.build_xpath()

    def ask_for_input(self, prompt, optional=False):
        """
        Prompt until a value is entered and return it stripped of surrounding whitespace.

        :param prompt: text shown to the user.
        :param optional: when True an empty answer is accepted and returned as ``None``.
        """
        while True:
            user_input = input(prompt).strip()
            if user_input or optional:
                return user_input or None
            print("This field is required. Please enter a value.")

    def handle_element_specific_logic(self, element):
        """
        Ask what to target on ``element`` and return an ``(attribute, selector)`` tuple.

        ``a`` elements get a dedicated menu; every other element (and an unrecognised
        menu choice) falls back to free-form attribute/value prompts.
        """
        if element == 'a':
            print("You've selected an <a> element. What would you like to target?")
            print("1: href attribute")
            print("2: Text content")
            print("3: Class attribute")
            print("4: ID attribute")
            # Extend this list based on the options you want to provide.

            choice = self.ask_for_input("Enter the number of your choice: ")

            if choice == '1':
                return ('href', None)
            if choice == '2':
                # Text is read from the matched element itself, so no predicate is needed.
                return (None, None)
            if choice == '3':
                return ('class', self.ask_for_input("Enter the class name: "))
            if choice == '4':
                return ('id', self.ask_for_input("Enter the ID: "))
            # Add more branches for additional choices if necessary.
            print("Unrecognized choice; falling back to manual attribute entry.")
        # Implement additional if statements for other elements with unique options.

        # Default fallback for other elements or unspecified attributes.
        attribute = self.ask_for_input("Attribute (e.g., class, id, style): ", optional=True)
        selector = self.ask_for_input(f"Value of {attribute}: ", optional=True) if attribute else None
        return attribute, selector

    @staticmethod
    def _xpath_literal(value):
        """Return ``value`` as an XPath 1.0 string literal, whichever quote characters it contains."""
        text = str(value)
        if "'" not in text:
            return f"'{text}'"
        if '"' not in text:
            return f'"{text}"'
        # XPath 1.0 has no escape sequences: glue the pieces together with concat().
        return "concat(" + ", \"'\", ".join(f"'{part}'" for part in text.split("'")) + ")"

    def build_xpath(self):
        """
        Build the XPath from ``self.element``, ``self.attribute`` and ``self.selector``.

        - attribute and selector: ``//element[@attribute='selector']``
        - attribute only:         ``//element[@attribute]``
        - ``a`` with a selector but no attribute: ``//a[text()='selector']``
        - otherwise:              ``//element``
        """
        if self.attribute and self.selector:
            return f"//{self.element}[@{self.attribute}={self._xpath_literal(self.selector)}]"
        if self.attribute:
            return f"//{self.element}[@{self.attribute}]"
        if self.element == 'a' and self.selector:
            # Match on link text only when there is text to match; a missing selector
            # must not turn into the literal string 'None'.
            return f"//{self.element}[text()={self._xpath_literal(self.selector)}]"
        return f"//{self.element}"

    def __str__(self):
        """
        Returns the constructed XPath expression when the object is printed.
        """
        return self.xpath

In [ ]:
class IXpath:
    """
    Holds one XPath expression for a website. Create one object per location you
    want to extract, then pass the object(s) to scraper object(s).

    Example:
    "//div[@class='class-name']"
    """

    def __init__(self, element:str=None, attribute:str=None, selector:str=None):
        """
        Builds the XPath and stores it in ``self.xpath``.

        :param element: The tag the target lives under (div, span, p, h{num}, a). Defaults to ``*`` (any tag).
        :param attribute: The attribute used to identify the element (class, id, href).
        :param selector: The value the attribute must equal.
        :raises ValueError: if ``selector`` is given without an ``attribute`` to compare it against.

        Example:
        element=div, attribute=class, selector='class-name'
        //{element}[@{attribute}='{selector}']

        Omitted parts are left out of the expression rather than rendered as the text
        ``None``: ``IXpath('div', 'class')`` gives ``//div[@class]`` and ``IXpath('div')``
        gives ``//div``.
        """
        if selector and not attribute:
            raise ValueError("a selector needs an attribute to be compared against")

        tag = element or "*"
        if attribute and selector:
            text = str(selector)
            # XPath 1.0 has no escapes inside string literals, so pick a quote
            # character the value does not contain (or fall back to concat()).
            if "'" not in text:
                literal = f"'{text}'"
            elif '"' not in text:
                literal = f'"{text}"'
            else:
                literal = "concat(" + ", \"'\", ".join(f"'{part}'" for part in text.split("'")) + ")"
            self.xpath = f"//{tag}[@{attribute}={literal}]"
        elif attribute:
            self.xpath = f"//{tag}[@{attribute}]"
        else:
            self.xpath = f"//{tag}"

    def __str__(self):
        """Returns the XPath expression, so the object can be printed or formatted directly."""
        return self.xpath

In [ ]:
class InfiniteScrollScraper:
    """
    Scrape listing records from a page that loads more content as it is scrolled.

    A headless Chrome session is started when the object is created. Call
    ``run()`` to scroll the page, collect the records, write them to CSV and
    shut the browser down again.

    NOTE(review): ``ITEM_SELECTOR`` and ``FIELD_SELECTORS`` still hold template
    placeholders; replace them with real CSS selectors for the target site
    (they can be overridden on a subclass or on an instance).
    """

    # CSS selector that matches one listing on the page.
    ITEM_SELECTOR = "selector_for_items"
    # Output column -> CSS selector, evaluated relative to a single listing.
    FIELD_SELECTORS = {
        "Name": "selector_for_name",
        "Address": "selector_for_address",
        "Email": "selector_for_email",
        "Phone": "selector_for_phone",
        "Website": "selector_for_website",
    }

    def __init__(self, site: str, xpath_element:str="div", xpath_attribute:str="class", xpath_selector:list=None,
                 scroll_pause: float = 3.0):
        """
        The params let the user choose the XPath dynamically for each site the scraper
        is used on. Multiple instances can be created.

        :param site: URL of the page to scrape.
        :param xpath_element: tag name of the elements to target.
        :param xpath_attribute: attribute used to identify those elements.
        :param xpath_selector: one attribute value or a list of them; surrounding quotes are optional.
            When omitted, any ``xpath_element`` carrying ``xpath_attribute`` matches.
        :param scroll_pause: seconds to wait after each scroll so new content can load.
        """
        self.url = site
        self.data = []
        self.scroll_pause = scroll_pause

        # Accept a single value as well as a list so a one-item selector still works.
        if xpath_selector is None:
            selectors = []
        elif isinstance(xpath_selector, str):
            selectors = [xpath_selector]
        else:
            selectors = list(xpath_selector)

        if selectors:
            # Each value becomes a quoted XPath string literal; quotes the caller
            # already added are dropped so they are not doubled.
            self.xpaths = [
                "//{}[@{}='{}']".format(xpath_element, xpath_attribute, str(value).strip().strip("'\""))
                for value in selectors
            ]
        else:
            self.xpaths = [f"//{xpath_element}[@{xpath_attribute}]"]
        # One expression per selector, combined with the XPath union operator.
        self.xpath = " | ".join(self.xpaths)

        self.drive_type = None
        self.driver = self.setup_chrome_driver()

    @staticmethod
    def setup_chrome_driver():
        """Create and return a headless Chrome WebDriver."""
        # Imported locally under explicit aliases: `Service` and `Options` exist for both
        # browsers, and the bare module-level names can only refer to one of them.
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.webdriver.chrome.service import Service as ChromeService

        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")  # Run in background
        return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    @staticmethod
    def setup_firefox_driver():
        """Create and return a headless Firefox WebDriver."""
        from selenium.webdriver.firefox.options import Options as FirefoxOptions
        from selenium.webdriver.firefox.service import Service as FirefoxService

        firefox_options = FirefoxOptions()
        firefox_options.add_argument("--headless")
        return webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=firefox_options)

    def scroll_and_scrape(self):
        """Open ``self.url`` and scrape it, scrolling to the bottom until the page stops growing."""
        self.driver.get(self.url)
        # Scrape what is already there; otherwise a page that never grows yields nothing.
        self.scrape_data()
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to the bottom and give the page time to load more items.
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(self.scroll_pause)

            # An unchanged height means nothing new was loaded: we are done.
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

            self.scrape_data()

    def scrape_data(self):
        """Append every listing currently on the page to ``self.data``, skipping records already collected."""
        # Each pass sees all listings again, so remember what has been stored already.
        seen = {tuple(record.items()) for record in self.data}
        for item in self.driver.find_elements(By.CSS_SELECTOR, self.ITEM_SELECTOR):
            record = {
                field: self._first_text(item, css_selector)
                for field, css_selector in self.FIELD_SELECTORS.items()
            }
            key = tuple(record.items())
            if key in seen:
                continue
            seen.add(key)
            self.data.append(record)

    @staticmethod
    def _first_text(item, css_selector):
        """Return the text of the first match under ``item``, or "" when the field is absent."""
        # find_elements returns an empty list instead of raising, so one listing
        # without e.g. an email does not abort the whole scrape.
        matches = item.find_elements(By.CSS_SELECTOR, css_selector)
        return matches[0].text if matches else ""

    def save_data(self, path="scraped_data.csv"):
        """Write the collected records to ``path`` as CSV (the header row is written even when empty)."""
        df = pd.DataFrame(self.data, columns=list(self.FIELD_SELECTORS))
        df.to_csv(path, index=False)

    def close_driver(self):
        """Shut down the browser session."""
        self.driver.quit()

    def run(self):
        """Scroll, scrape and save, always closing the browser afterwards."""
        # The driver is created in __init__, so there is nothing to set up here.
        try:
            self.scroll_and_scrape()
            self.save_data()
        finally:
            self.close_driver()

In [1]:
import os

# MongoDB connection URI. The real value (including the password) should come from the
# environment -- e.g. a BLAZE_MONGODB_URI entry in the .env file read by load_dotenv() --
# so that credentials are never committed with the notebook. The fallback is the original
# template, whose literal "<password>" placeholder must be replaced before it can connect.
BlazeMongoDB = os.getenv(
    "BLAZE_MONGODB_URI",
    "mongodb+srv://cmwolfe:<password>@blaze-logistics.u33eshg.mongodb.net/?retryWrites=true&w=majority",
)

['Select all elements with `//tagName`',
 'Select all elements with `//tagName` with `attribute` with specific `value`: ',
 "Select all elements with `//tagName` containing specific 'text content': ",
 'Select nth `tagName` element in doc: ',
 'Select first|last `childTagName` within `parentTagName`: ',
 "Select elements that `attribute` = 'substring': ",
 'Select `tagName` ancestor|sibling element, showcasing XPath axes to navigate element relationships: ',
 'Use logical operators (and, or) to select elements that meet multiple conditions: ',
 'Selects all tags with `className`, accounting for potential multiple class names.',
 "Select elements without certain `attribute` or select 'attribute' that exceeds the len of `n`( demonstrating the use of XPath functions for more complex filtering) :"]