In [3]:
from datetime import datetime
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By

import json
import re
import sys

# Tracking Scraper Exception

In [4]:
class TrackingScraperError(Exception):
    """
    Error type raised by the Tracking Scraper classes.

    Besides the message, each instance remembers the container and carrier being processed,
    the moment the error object was created, and optionally the underlying exception.
    """

    def __init__(self, message, container = None, carrier = None, exception = None):
        # Let the base Exception keep the message in "args"
        super().__init__(message)

        # Context describing what was being scraped when the error happened
        self.message = message
        self.container, self.carrier = container, carrier

        # Underlying exception (if any) and creation timestamp
        self.exception = exception
        self.datetime = datetime.today()

    def __str__(self):
        # General info: timestamp, carrier, container and message
        template = "{when}, carrier={carrier}, container={container}: {text}."
        summary = template.format(when = str(self.datetime), carrier = self.carrier,
                                  container = self.container, text = self.message)

        # Without an underlying exception the summary is the whole description
        if self.exception is None:
            return summary

        # Otherwise append the text of the underlying exception
        return summary + " " + str(self.exception)

# Tracking Scraper selector switcher class

In [39]:
class TrackingSelectorSwitcher:
    """
    Tracking Scraper switcher for selecting and saving DOM elements and subelements in a tracking document.

    Each instance wraps one configuration command (a dictionary with at least a "type" key) together
    with the element the command applies to. process() dispatches to the "_process_<type>" method;
    selector commands create child switchers for their nested command(s).
    """

    def __init__(self, container, carrier, driver, document, attribute_group, parent_command, parent_element = None):
        """
        container, carrier -- identifiers, only used when reporting errors.
        driver             -- Selenium WebDriver; closed by _close_driver() on configuration errors.
        document           -- tracking dictionary that "save" commands write into.
        attribute_group    -- key of "document" under which attributes are saved.
        parent_command     -- configuration dictionary describing this command.
        parent_element     -- element (or text) the command is applied to; defaults to the driver itself.
        """
        self.__container = container
        self.__carrier = carrier
        self.__driver = driver
        self.__document = document
        self.__attribute_group = attribute_group
        self.__parent_command = parent_command
        self.__parent_element = driver if (parent_element is None) else parent_element

    @property
    def get_document(self):
        """
        Returns the stored tracking-related dictionary.
        """
        return self.__document

    ###########################################################################################################

    def process(self):
        """
        Get DOM element(s) based on the configuration selector element declared in initialization,
        then process or return them accordingly. Returns True if all commands were executed successfully,
        False if one command failed, or DOM element(s) if no commands were found.

        Raises TrackingScraperError (after closing the driver) if the selector type is missing or unknown.
        """

        # Get selector type
        selector_type = self.__parent_command.get("type")
        if selector_type is None:
            self._close_driver("Selector type not found")

        # Look the handler up separately from calling it, so that an AttributeError raised
        # inside a handler is not misreported as an invalid selector type.
        method = getattr(self, "_process_" + str(selector_type), None)
        if method is None:
            self._close_driver("Process: Selector type " + str(selector_type) + " is not valid.")

        return method()

    ###########################################################################################################

    def _process_id(self):
        """
        Get DOM element by ID, then process it or return it depending on the parent command.

        Returns the DOM element if the command declares no child "command", otherwise the result
        of processing that child command on the element.
        """

        # Get selector value (read from the command this switcher was created with)
        selector_value = self.__parent_command.get("value")
        if selector_value is None:
            self._close_driver("Process by ID: Selector value not found.")

        # Get DOM element by ID, using the same generic locator API as _process_dom_elements
        dom_element = self.__parent_element.find_element(By.ID, selector_value)

        # Get child command, if it's not declared, return DOM element
        child_command = self.__parent_command.get("command")
        if child_command is None:
            return dom_element

        # Create child selector switcher and return its process result
        child_selector = TrackingSelectorSwitcher(self.__container, self.__carrier, self.__driver, self.__document,
                                                  self.__attribute_group, child_command, dom_element)
        return child_selector.process()

    ###########################################################################################################

    def _process_class(self):
        """
        Get DOM elements by class name, then process them or return them depending on the parent command.
        """
        return self._process_dom_elements(By.CLASS_NAME)

    def _process_css(self):
        """
        Get DOM elements by CSS selector, then process them or return them depending on the parent command.
        """
        return self._process_dom_elements(By.CSS_SELECTOR)

    def _process_tag(self):
        """
        Get DOM elements by tag name, then process them or return them depending on the parent command.
        """
        return self._process_dom_elements(By.TAG_NAME)

    def _process_name(self):
        """
        Get DOM elements by "name" attribute, then process them or return them depending on the parent command.
        """
        return self._process_dom_elements(By.NAME)

    def _process_xpath(self):
        """
        Get DOM elements by XPath, then process them or return them depending on the parent command.
        """
        return self._process_dom_elements(By.XPATH)

    def _process_dom_elements(self, by_attribute):
        """
        Get DOM elements by a specified By attribute, then process them or return them depending on the parent command.

        Returns the list of DOM elements if the command declares no "commands". Otherwise each subcommand
        is applied to the element at its "index":
        - a missing element fails the whole command (returns False) when the subcommand is required,
          and is skipped when it is optional ("required": False);
        - the first subcommand whose result is not True stops the iteration and that result is returned;
        - True is returned when every subcommand succeeded.
        """

        # Get selector value
        selector = self.__parent_command.get("value")
        if selector is None:
            self._close_driver("Process by " + by_attribute + ": Selector value not found.")

        # Get DOM elements by specified attribute
        dom_elements = self.__parent_element.find_elements(by_attribute, selector)

        # Get subcommands, if none are declared, return DOM elements
        commands = self.__parent_command.get("commands")
        if commands is None:
            return dom_elements

        # Iterate through selector command elements
        for child_command in commands:
            print("Subcommand:", child_command)

            # Get element index to use
            dom_index = child_command.get("index")
            if dom_index is None:
                self._close_driver("Process by " + by_attribute + ": Subcommand index not found")

            # Check if element is required, if not declared assume true
            is_required = child_command.get("required", True)

            # Get DOM element at index
            try:
                child_element = dom_elements[dom_index]
            except IndexError:
                # str() is needed: the index is a number and cannot be concatenated directly
                print("Child element at index " + str(dom_index) + " was not found.")
                if is_required:
                    # A required element is missing: the command failed
                    return False
                # An optional element is missing: nothing to do, go on with the next subcommand
                continue

            # Create child selector switcher
            child_selector = TrackingSelectorSwitcher(self.__container, self.__carrier, self.__driver, self.__document,
                                                      self.__attribute_group, child_command, child_element)

            # Process child selector switcher
            child_result = child_selector.process()
            if child_result is not True:
                # if no subelements were found, return that element or element list
                # if a minor error occured (eg. element not found), return False
                return child_result

        # If everything was fine, return True
        return True

    ###########################################################################################################

    def _process_split(self):
        """
        Placeholder for the "split" command: not implemented yet, always reports success.
        """
        return True

    def _process_regex(self):
        """
        Placeholder for the "regex" command: not implemented yet, always reports success.
        """
        return True

    ###########################################################################################################

    def _process_save(self):
        """
        Saves element to the document, according to specified attribute and attribute group.

        Returns True if the text was saved, or if it was empty but the attribute is optional.
        Returns False if the text was empty and the attribute is required.
        """

        # Get attribute value to save to
        attribute = self.__parent_command.get("value")
        if attribute is None:
            self._close_driver("Process save: Attribute to save not found")

        # Check if element is required
        is_required = self.__parent_command.get("required", True)

        # Get text to be saved
        saved_text = self.__parent_element
        try:
            # if value in "saved_text" is a DOM element, we need its text
            saved_text = saved_text.text
        except AttributeError:
            pass # value in "saved_text" is already a string

        # Empty text is only a failure when the attribute is required
        if len(saved_text) == 0:
            return not is_required

        # Save text into the document
        try:
            self.__document[self.__attribute_group][attribute] = saved_text
            return True
        except KeyError:
            self._close_driver("Process save: Attribute group or attribute not found")

    ###########################################################################################################

    def _close_driver(self, message = None):
        """
        Closes the Selenium Driver and raises an exception.

        Always raises TrackingScraperError describing this selector and the given message.
        """

        # Closing is best effort: a driver that cannot be closed (e.g. it is already closed)
        # must not hide the error that is about to be reported.
        try:
            self.__driver.close()
        except WebDriverException:
            pass

        # Format error message
        error_message = "Selector(attribute_group={0}, parent_command={1}): {2}"
        error_message = error_message.format(self.__attribute_group, str(self.__parent_command), message)
        # Raise exception
        raise TrackingScraperError(error_message, self.__container, self.__carrier)

# Tracking Scraper main class

In [40]:
class TrackingScraper:
    """
    Web scraper for container information extraction.

    Opens a Firefox WebDriver, loads the carrier configuration from "../config/<carrier>.json"
    and collects the scraped values in a tracking document (see get_tracking_object).
    """

    def __init__(self, container, carrier):
        """
        Initializes Selenium WebDriver, the container tracking object, and configuration for the shipping carrier.

        Raises TrackingScraperError if the driver cannot be created or the configuration cannot be loaded
        (in the latter case the driver is closed first).
        """

        # Save main attributes
        self.__container = container
        self.__carrier = carrier

        # Define the driver attribute up front, so that _close_driver() and __del__()
        # are safe even when the WebDriver could not be created.
        self.__driver = None

        # Open the Firefox WebDriver
        try:
            self.__driver = webdriver.Firefox()
        except WebDriverException as ex:
            raise TrackingScraperError("Error creating Selenium driver", container, carrier, ex)

        # Define tracking information
        self.__tracking_document = {
            "output_general": {
                "container_number": container
            },
            "output_movements": []
        }

        # Get tracking configuration information
        try:
            with open("../config/" + carrier + ".json") as config:
                self.__config = json.load(config)
        except FileNotFoundError:
            self._close_driver("Configuration file not found")
        except json.JSONDecodeError as ex:
            # JSONDecodeError lives in the json module; only "json" itself is imported
            self._close_driver("JSON parsing error", ex)
        except Exception as ex:
            self._close_driver("Unknown error", ex)

    ###########################################################################################################

    def go_to_url(self):
        """
        Go to the URL specified in the configuration file.

        The URL is read from config["general"]["url"]; a "{container}" placeholder in it is replaced
        with the container number. Raises TrackingScraperError (after closing the driver) if the URL
        is not configured or the page cannot be opened.
        """

        # Get URL from configuration file
        try:
            url = self.__config["general"]["url"]
        except KeyError:
            self._close_driver("URL not found in configuration file")

        # Parse container if necessary, and go to page
        try:
            self.__driver.get(url.format(container = self.__container))
        except WebDriverException as ex:
            self._close_driver("Error ocurred while going to URL", ex)

    ###########################################################################################################

    def do_input_commands(self):
        """
        Executes input commands in the page, before executing the required output commands.
        Returns True if everything was executed successfully, False otherwise.

        Currently the "input_assert" commands are only printed, not executed, so the result is always True.
        """

        # Get input assertion commands
        input_asserts = self.__config.get("input_assert")
        if input_asserts is None:
            return True

        # Iterate through input assertion commands
        for input_assert in input_asserts:
            print(input_assert)

        return True

    ###########################################################################################################

    def do_output_general_commands(self):
        """
        Executes output general commands before sending a form or reading container/tracking information.
        Returns True if everything was executed successfully, False otherwise.

        Every command is processed even if an earlier one failed; the result is False as soon as
        one command reports False.
        """

        # Get output general commands
        output_commands = self.__config.get("output_general")
        if output_commands is None:
            return True

        # Iterate though output general commands
        succeeded = True
        for command in output_commands:
            selector = TrackingSelectorSwitcher(self.__container, self.__carrier, self.__driver,
                                                self.__tracking_document, "output_general", command)
            result = selector.process()
            print("New command result: ", result)

            # Only an explicit False is a failure: a selector without subcommands
            # returns the DOM element(s) it found instead of True.
            if result is False:
                succeeded = False

        return succeeded

    @property
    def get_tracking_object(self):
        """
        Returns the tracking document filled in so far.
        """
        return self.__tracking_document

    ###########################################################################################################

    def _close_driver(self, message = None, exception = None):
        """
        Closes the Selenium Driver. If arguments are specified, also raises an exception.

        Raises TrackingScraperError when "message" is given; "exception" is attached to it.
        """

        # There is nothing to close if the driver was never created
        if self.__driver is not None:
            try:
                self.__driver.close()
            except InvalidSessionIdException:
                pass # Driver already closed

        if message is not None:
            raise TrackingScraperError(message, self.__container, self.__carrier, exception)

    def __del__(self):
        """
        Close the driver before garbage collection.
        """
        self._close_driver()

In [41]:
# Create the scraper for one container of one carrier and open its tracking page.
# Scraper errors are reported as text instead of a traceback.
try:
    scraper = TrackingScraper(container = "FSCU5670046", carrier = "Hapag-Lloyd")
    scraper.go_to_url()
except TrackingScraperError as scraper_error:
    print(scraper_error)

In [42]:
# Run the "output_general" commands of the carrier configuration against the opened page;
# each processed subcommand and each command result is printed below.
scraper.do_output_general_commands()

Subcommand: {'value': 'container_type', 'required': False, 'type': 'save', 'index': 0}
Subcommand: {'value': 'container_description', 'required': True, 'type': 'save', 'index': 1}
Subcommand: {'value': ' X ', 'type': 'split', 'index': 2, 'commands': [{'value': 'container_length', 'required': False, 'type': 'save', 'index': 0}, {'value': 'container_height', 'required': False, 'type': 'save', 'index': 2}, {'value': 'container_width', 'required': False, 'type': 'save', 'index': 1}]}
Subcommand: {'value': 'container_tare', 'required': False, 'type': 'save', 'index': 3}
Subcommand: {'value': 'container_max_payload', 'required': False, 'type': 'save', 'index': 4}
Subcommand: {'value': 'The container (.*?) (in|from) (.*?) at (.*?) ', 'type': 'regex', 'index': 5, 'commands': [{'value': 'last_status', 'required': True, 'type': 'save', 'index': 0}, {'value': 'last_location', 'required': True, 'type': 'save', 'index': 2}, {'value': 'last_date', 'required': True, 'type': 'save', 'index': 3}]}
New 

In [43]:
# Show the tracking document collected so far.
print(scraper.get_tracking_object)

{'output_movements': [], 'output_general': {'container_type': '45RT', 'container_tare': '4640', 'container_description': 'REEFER CONTAINER', 'container_number': 'FSCU5670046', 'container_max_payload': '29360'}}


In [44]:
# Close the browser explicitly; called without a message, _close_driver() does not raise TrackingScraperError.
scraper._close_driver()