In [1]:
from datetime import datetime
from selenium import webdriver
from selenium.common.exceptions import *

import json
import re
import sys

# Tracking Scraper Exception

In [2]:
class TrackingScraperError(Exception):
    """
    Custom exception for the Tracking Scraper class.
    """
    
    def __init__(self, message, container = None, carrier = None, exception = None):
        # Call base class constructor
        super().__init__(message)
        
        # Declare custom attributes
        self.message = message
        self.container = container
        self.carrier = carrier
        self.datetime = datetime.today()
        self.exception = exception
    
    def __str__(self):
        # Generate base string with general info
        message = "{0}, carrier={1}, container={2}: {3}."
        message = message.format(str(self.datetime), self.carrier, self.container, self.message)
        
        # Generate custom message
        if self.exception is not None:
            message += " " + str(self.exception) #.replace("\n", "")
        
        return message

# Tracking Scraper selector switcher class

In [29]:
class TrackingSelectorSwitcher:
    """
    Tracking Scraper switcher for selecting and saving DOM elements and subelements in a tracking document.
    """
    
    def __init__(self, container, carrier, driver, document, attribute_group, parent_command, parent_element = None):
        self.__container = container
        self.__carrier = carrier
        self.__driver = driver
        self.__document = document
        self.__attribute_group = attribute_group
        self.__parent_command = parent_command
        self.__parent_element = driver if (parent_element is None) else parent_element
    
    @property
    def get_document(self):
        return self.__document
    
    ###########################################################################################################
    
    def process(self):
        """
        Get DOM element(s) based on the configuration selector element declared in initialization,
        then process or return them accordingly. Returns True if all commands were executed successfully,
        False if one command failed, or DOM element(s) if no commands were found.
        """
        
        # Get selector type
        selector_type = self.__parent_command.get("type")
        if selector_type is None:
            self._close_driver("Selector type not found")
        
        # Execute process based on selector type
        try:
            method = getattr(self, "_process_" + selector_type)
            return method()
        except AttributeError:
            self._close_driver("Process: Selector type " + selector_type + " is not valid.")
    
    ###########################################################################################################
    
    def _process_id(self):
        """
        Get DOM element by ID, then process it or return it depending on the parent command.
        Returns True if subcommand was executed successfully and False if subcommand failed, or returns
        DOM element if no subcommands were found.
        """
        
        # Get selector value
        selector_value = self.__selector_command.get("value")
        if selector_value is None:
            self._close_driver("Process by ID: Selector value not found.")
        
        # Get DOM element by ID
        dom_element = self.__parent_element.find_element_by_id(selector_value)
        
        return dom_element
    
    ###########################################################################################################
    
    def _process_class(self):
        """
        Get DOM elements by class name, then process them or return them depending on the parent command.
        """
        
        # Get class name
        class_name = self.__parent_command.get("value")
        if class_name is None:
            self._close_driver("Process by class name: Selector value not found.")
        
        # Get DOM elements by class name
        dom_elements = self.__parent_element.find_elements_by_class(class_name)
        
        # Get commands, if not found, return DOM elements
        commands = self.__parent_command.get("commands")
        if selector_commands is None:
            return dom_elements
        
        # Iterate through selector command elements
        for command in selector_commands:
            
            # Get index to use
            index = command.get("index")
            if index is None:
                self._close_driver("Process by class name: Index not found")
            
            # Get DOM element at index, if not found, return False
            try:
                dom_element = dom_elements[index]
            except IndexError:
                return False
    
    ###########################################################################################################
    
    def _process_css(self):
        """
        Get DOM elements by CSS selector, then
        """
        
        # Get CSS selector
        selector = self.__parent_command.get("value")
        if selector is None:
            self.close_driver("Process by CSS selector: Selector value not found.")
        
        # Get DOM elements by CSS selector
        dom_elements = self.__parent_element.find_elements_by_css_selector(selector)
        
        # Get selector command elements
        commands = self.__parent_command.get("commands")
        if commands is None:
            return dom_elements
        
        # Iterate through selector command elements
        for command in commands:
            print(command)
    
    ###########################################################################################################
    
    def _process_save(self):
        """
        Saves element to the document, according to specified attribute and attribute group.
        """
        
        # Get attribute value to save to
        attribute = self.__parent_command.get("value")
        if attribute is None:
            self._close_driver("Process save: Attribute to save not found")
        
        # Check if element is required
        is_required = self.__parent_command.get("required")
        if is_required is None:
            is_required = True
        
        # Get text to be saved
        saved_text = self.__parent_element
        try:
            # if value in "saved_text" is a DOM element, we need its text
            saved_text = saved_text.text
        except AttributeError:
            pass # value in "saved_text" is already a string
        
        # Verify if text is not empty
        if len(saved_text) == 0:
            return is_required
        
        # Save text into the document
        try:
            self.__document[self.__attribute_group][attribute] = saved_text
            return True
        except KeyError:
            self._close_driver("Process save: Attribute group or attribute not found")
    
    ###########################################################################################################
    
    def _close_driver(self, message = None):
        """
        Closes the Selenium Driver and raises an exception.
        """
        
        self.__driver.close()
        
        # Format error message
        error_message = "Selector(attribute_group={0}, parent_command={1}): {2}"
        error_message = error_message.format(self.__attribute_group, str(self.__parent_command), message)
        # Raise exception
        raise TrackingScraperError(error_message, self.__container, self.__carrier)

# Tracking Scraper main class

In [7]:
class TrackingScraper:
    """
    Web scraper for container information extraction.
    """
    
    def __init__(self, container, carrier):
        """
        Initializes Selenium WebDriver, the container tracking object, and configuration for the shipping carrier.
        """
        
        # Save main attributes
        self.__container = container
        self.__carrier = carrier
        
        # Open the Firefox WebDriver
        try:
            self.__driver = webdriver.Firefox()
        except WebDriverException as ex:
            raise TrackingScraperError("Error creating Selenium driver", container, carrier, ex)
        
        # Define tracking information
        self.__tracking_document = {
            "general": {
                "container_number": container
            },
            "movements": []
        }
        
        # Get tracking configuration information
        try:
            with open("../config/" + carrier + ".json") as config:
                self.__config = json.load(config)
        except FileNotFoundError:
            self.close_driver("Configuration file not found")
        except JSONDecodeError as ex:
            self.close_driver("JSON parsing error", ex)
        except Exception as ex:
            self.close_driver("Unknown error", ex)
    
    ###########################################################################################################
    
    def go_to_url(self):
        """
        Go to the URL specified in the configuration file.
        """
        
        # Get URL from configuration file
        try:
            url = self.__config["general"]["url"]
        except KeyError:
            self.close_driver("URL not found in configuration file")
        
        # Parse container if necessary, and go to page
        try:
            self.__driver.get(url.format(container = self.__container))
        except WebDriverException as ex:
            self.close_driver("Error ocurred while going to URL", ex)
    
    ###########################################################################################################

    def do_input_asserts(self):
        """
        Executes assertions in the page, before executing the required input commands.
        Returns True if everything correct, False otherwise.
        """
        
        # Get input assertion commands
        input_asserts = self.__config.get("input_assert")
        if input_asserts is None:
            return True
        
        # Iterate through input assertion commands
        for input_assert in input_asserts:
            print(input_assert)
    
    ###########################################################################################################
    
    def do_input_general_commands(self):
        """
        Executes input or action commands before sending a form or reading container/tracking information.
        """
        
        # Get input general commands
        input_commands = self.__config.get("input_general")
        if input_commands is None:
            return True
        
        # Iterate though input general commands
        for command in input_commands:
            selector = TrackingSelectorSwitcher(self.__driver, self.__tracking_document, "")
    
    ###########################################################################################################
    
    def close_driver(self, message = None, exception = None):
        """
        Closes the Selenium Driver. If arguments are specified, also raises an exception.
        """
        
        try:
            self.__driver.close()
        except InvalidSessionIdException:
            pass # Driver already closed
        
        if message is not None:
            raise TrackingScraperError(message, self.__container, self.__carrier, exception)
    
    def __del__(self):
        """
        Close the driver before garbage collection.
        """
        self.close_driver()

In [9]:
try:
    scraper = TrackingScraper("FSCU5670046", "Hapag-Lloyd")
    # scraper.go_to_url()
except TrackingScraperError as ex:
    print(ex)

In [11]:
scraper.close_driver()

In [1]:
class ContainerScraper:
    
    def _selector_finder_by_id (self):
        return self.driver.find_element_by_id
    
    selector_finder = {
        "id"    : self.driver.find_elements_by_id,
        "class" : self.driver.find_elements_by_class_name,
        "css"   : self.driver.find_elements_by_css_selector,
        "tag"   : self.driver.find_elements_by_tag_name,
        "xpath" : self.driver.find_elements_by_xpath
    }
    
    def __init__(self, carrier, container):
        self.driver = webdriver.Firefox()
        self.config = json.dumps(carrier + ".json")
        self.container = container
        self.tracking = {
            "general": {
                "container_number": container
            },
            "container": {
                "number": container
            },
            "last_route": {},
            "movements": []
        }
    
    def go_to_page (self):
        url = self.config["url"].format(container = self.container)
        self.driver.get(url)

    def save_general_info (self):
        general_info = self.config["output"]["general_info"]
        
        # Obtener el tipo de búsqueda
        selector_type = selector_types.get(general_info["type"])
        if (selector_type is None):
            return False
        
        # Obtener la lista de elementos
        elements = self.driver.find_elements(selector_types[general_info["type"]], general_info["selector"])
        
        # Guardar cada atributo definido según la información del elemento
        for attribute, index in general_info["attributes"].items():
            try:
                tracking[attribute] = elements[index].text
            except IndexError:
                return False
        return True
    
    """
    Private function that saves an attribute to the tracking dictionary from elements[index].
    Returns True if the attribute found and the attribute is required, True otherwise
    """
    def _save_general_attribute (self, attribute, elements, index = 0, required = True):
        try:
            self.tracking["general"][attribute] = elements[index].text
            return True
        except IndexError:
            return required

NameError: name 'self' is not defined

In [5]:
class ContainerScraper:
    
    selector_types = {
        "id"    : By.ID,
        "class" : By.CLASS_NAME,
        "css"   : By.CSS_SELECTOR,
        "tag"   : By.TAG_NAME,
        "xpath" : By.XPATH
    }
    
    def __init__(self, driver, container):
        self.driver = driver
        self.info = {
            "general": {
                "container_number": container
            },
            "movements": []
        }
    
    """
    Obtiene la lista de elementos DOM HTML en base a un selector y a un tipo de selector (CSS, ID, clase, etc.)
    """
    def __find_elements(self, key, selector):
        try:
            return self.driver.find_elements(selector_types[key], selector)
        except KeyError:
            return None
    
    """
    Obtiene el texto de un elemento DOM HTML en un índice específico.
    """
    def __get_element_text(self, elements, index = 0): #, required = True
        try:
            return elements[index].text
        except IndexError:
            return None

In [26]:
class Test:
    
    def asdf(self):
        return "asdf"
    
    def sdfg(self):
        return "sdfg"
    
    dictionary = {
        "asdf": asdf,
        "sdfg": sdfg
    }
    
    def execute(self, command):
        try:
            result = self.dictionary[command](self)
            print(result)
        except KeyError:
            print("Error: command not found")

In [28]:
Test().execute("sdfg")

sdfg
