# Tracking Scraper

In [1]:
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from datetime import datetime, timedelta
from pymongo import MongoClient

import json
import logging
import re
import time
import sys

In [2]:
# Send every log record (INFO and above) to a dated file next to the notebook.
logging.basicConfig(
    filename = "../logs/scraper-20190325.log",
    level    = logging.INFO,
    format   = "[%(levelname)s %(asctime)s] %(message)s",
)

# MongoClient() is called with no arguments, so the driver's default connection settings apply.
client   = MongoClient()
database = client["tracking_scraper"]

## Exception class and constants

In [3]:
class TrackingScraperError(Exception):
    """Error raised by the Tracking Web Scraper for configuration and scraping failures."""

class TrackingScraperConfig:
    """
    Constants and basic configuration for the Tracking Web Scraper.

    The DEFAULT_KEY_* values are the fallbacks used when a configuration command omits the
    corresponding optional key.
    """

    # Default timeout, in seconds
    DEFAULT_TIMEOUT         = 60
    # Default wait for long actions, in seconds
    DEFAULT_WAIT_LONG       = 5
    # Default wait for short actions, in seconds
    DEFAULT_WAIT_SHORT      = 1.5

    # Default value for the key "required" in all types
    DEFAULT_KEY_REQUIRED    = True
    # Default value for the key "action" in type "alert"
    DEFAULT_KEY_ACTION      = True
    # Default value for the key "wait" in type "click"
    DEFAULT_KEY_WAIT        = True
    # Default value for the key "clean" in type "write"
    DEFAULT_KEY_CLEAN       = False
    # Default value for the key "enter" in type "write"
    DEFAULT_KEY_ENTER       = False
    # Default value for the key "overwrite" in the "multiple" section. The scraper reads this
    # constant, so it must exist or the lookup fails with AttributeError.
    # NOTE(review): True (replace previously stored tracking items on every run) is an
    # assumption; confirm the intended default.
    DEFAULT_KEY_OVERWRITE   = True

    # Default thousand separator symbol
    DEFAULT_THOUSAND_SYMBOL = ","
    # Default datetime locale information: keyword arguments for datetime.timedelta, which
    # the converter subtracts from parsed datetimes for the "local_*" formats
    DEFAULT_DATETIME_LOCALE = {
        "hours": -5
    }

## Selector class

In [4]:
class TrackingScraperSwitcher:
    """
    Switcher for selecting and saving Web elements and subelements in a tracking-related document.

    Each instance pairs one configuration command (a dictionary with at least a "type" key) with
    the element the command operates on. That element is the driver itself, a DOM element found
    by a parent command, or a plain string produced by a "split" or "regex" command.

    Return convention shared by all handlers: True means the command (and its children)
    succeeded, False means a recoverable failure (e.g. a required element is missing), and a
    list means no child commands were configured, so the found items are handed back.
    """

    def __init__(self, driver, document, configuration, parent_command, parent_element = None):
        """
        Store the execution context.

        driver:         Selenium WebDriver, used for waits and alerts.
        document:       dictionary where "save" and "ocr" commands store their values.
        configuration:  full carrier configuration, forwarded to the converter.
        parent_command: the command dictionary to execute.
        parent_element: element or string the command acts on; defaults to the driver.
        """
        self.__driver         = driver
        self.__document       = document
        self.__configuration  = configuration
        self.__parent_command = parent_command
        self.__parent_element = driver if parent_element is None else parent_element

    @property
    def document(self):
        """Returns the stored tracking-related dictionary."""
        return self.__document

    ###############################################################################################

    def process(self):
        """
        Get Web elements based on the current configuration command, then process or return them
        accordingly. Returns True if all commands and subcommands were executed successfully,
        False if one command failed, or the list of Web elements if no subcommands were found.

        Raises TrackingScraperError if the command has no "type", if the type has no handler,
        or if the handler fails with an AttributeError/TypeError (chained as the cause).
        """
        # Get process type
        process_type = self.__parent_command.get("type")
        if process_type is None:
            raise TrackingScraperError("Process type not found")
        logging.info("Process type: %s", process_type)

        # Resolve the handler separately from running it, so that an error raised *inside* a
        # handler is not misreported as an unknown process type.
        method = getattr(self, "_process_" + str(process_type), None)
        if method is None:
            raise TrackingScraperError("Process type " + str(process_type) + " is not valid")
        if not callable(method):
            raise TrackingScraperError("Process type " + str(process_type)
                                       + " can't be directly invoked")

        # Callers only expect TrackingScraperError from this method, so keep converting these
        # two exception types, but report what actually happened and keep the original cause.
        try:
            return method()
        except (AttributeError, TypeError) as ex:
            raise TrackingScraperError("Process type " + str(process_type) + " failed: "
                                       + str(ex)) from ex

    def __missing_result(self):
        """
        Result to report when something this command expected is absent: True if the command
        is optional ("required" is false), False if it is required (the default).
        """
        required = self.__parent_command.get("required", TrackingScraperConfig.DEFAULT_KEY_REQUIRED)
        return not required

    ###############################################################################################

    def _process_id(self):
        """Select elements by ID."""
        return self.__process_dom_elements(By.ID)
    def _process_class(self):
        """Select elements by class name."""
        return self.__process_dom_elements(By.CLASS_NAME)
    def _process_css(self):
        """Select elements by CSS selector."""
        return self.__process_dom_elements(By.CSS_SELECTOR)
    def _process_name(self):
        """Select elements by name."""
        return self.__process_dom_elements(By.NAME)
    def _process_tag(self):
        """Select elements by tag name."""
        return self.__process_dom_elements(By.TAG_NAME)
    def _process_xpath(self):
        """Select elements by XPath."""
        return self.__process_dom_elements(By.XPATH)

    def __process_dom_elements(self, selector_type):
        """
        Find the elements matching the command's "selector" under the parent element, then run
        the configured child commands on them.

        Child commands come from "commands" (a list, each entry addressing one element through
        its "index") or from "command" (one command applied to every element). Without either,
        the list of found elements is returned.
        """
        # Get selector
        selector = self.__parent_command.get("selector")
        if selector is None:
            raise TrackingScraperError("Selector not found in process by " + selector_type)

        # A command carrying an "assert" key is a pure wait step: once the assertion holds,
        # there is nothing else to do
        if self.__check_assertions(selector_type, selector) is True:
            return True

        # Get DOM elements
        dom_elements = self.__parent_element.find_elements(selector_type, selector)

        # Check requirements
        if not dom_elements:
            logging.info("No elements found, using required")
            return self.__missing_result()

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, dom_elements)

        # Get a child command for all elements and process them, if possible
        child_command = self.__parent_command.get("command")
        if child_command is not None:
            for child_element in dom_elements:
                result = self.__generate_child_process(child_command, child_element)
                if result is not True:
                    return result
            return True

        # If no single child command was found, return all DOM elements
        logging.info("No commands found, return")
        return dom_elements

    def __check_assertions(self, selector_type, selector):
        """
        Wait for the condition requested by the command's "assert" key.

        "assert": true waits until at least one matching element is present, "assert": false
        waits until none is. Returns True when an assertion was requested and held, False when
        the command has no boolean "assert" key. Raises TrackingScraperError if the condition is
        not met within DEFAULT_TIMEOUT seconds.
        """
        assertion = self.__parent_command.get("assert")
        if assertion is not True and assertion is not False:
            return False

        # The wait is always made against the driver (whole page), not the parent element
        wait = WebDriverWait(self.__driver, TrackingScraperConfig.DEFAULT_TIMEOUT,
                             TrackingScraperConfig.DEFAULT_WAIT_SHORT)
        condition = EC.presence_of_all_elements_located((selector_type, selector))

        if assertion is True:
            # Assert at least one element found
            try:
                wait.until(condition)
            except TimeoutException:
                raise TrackingScraperError("Assertion error: Elements unexpectedly not found")
        else:
            # Assert no elements found
            try:
                wait.until_not(condition)
            except TimeoutException:
                raise TrackingScraperError("Assertion error: Elements unexpectedly found")

        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)
        return True

    def __process_child_commands(self, commands, elements):
        """
        Run each child command on the element located at the child's "index" in elements.

        Stops at the first child whose result is not True and returns that result. If an index
        is past the end of elements, returns True or False depending on the child's "required".
        """
        for child_command in commands:
            # Get index
            index = child_command.get("index")
            if index is None:
                raise TrackingScraperError("Child index command not found")

            # Check requirements
            if index >= len(elements):
                logging.info("Child element at index %s not found, using required", index)
                return not child_command.get("required", TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

            # Process child element at specified index
            logging.info("Child index: %d", index)
            result = self.__generate_child_process(child_command, elements[index])

            # If no subelements were found, return that element or element list
            # If a minor error occured (e.g. element not found), return False
            if result is not True:
                return result

        # If everything was fine, return True
        return True

    def __generate_child_process(self, child_command, child_element):
        """Run child_command on child_element, sharing this switcher's driver and document."""
        return TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                       child_command, child_element).process()

    ###############################################################################################

    def _process_split(self):
        """
        Split the parent text on the command's "delimiter" and run the "commands" children on
        the pieces; without children, return the list of pieces.
        """
        # Get text to split
        parent_text = self.__get_parent_text()

        # Get text separator
        delimiter = self.__parent_command.get("delimiter")
        if delimiter is None:
            raise TrackingScraperError("No separator found")

        # Split text
        elements = parent_text.split(delimiter)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return split list
        return elements

    def __get_parent_text(self):
        """Return the parent element's text with surrounding whitespace removed."""
        parent_text = self.__parent_element
        try:
            return parent_text.text.strip() # value is a DOM element, we need its inner text
        except AttributeError:
            return parent_text.strip() # value is already a string

    ###############################################################################################

    def _process_regex(self):
        """
        Search the parent text with the command's "pattern" and run the "commands" children on
        the captured groups; without children, return the list of groups. If the pattern does
        not match, return True or False depending on "required".
        """
        # Get text
        text = self.__get_parent_text()

        # Get regular expression pattern
        pattern = self.__parent_command.get("pattern")
        if pattern is None:
            raise TrackingScraperError("No regular expression found")

        # Match expression with text
        regex = re.search(pattern, text)
        if regex is None:
            logging.info("Regular expression does not match text, using required")
            return self.__missing_result()

        # Get list of matched elements
        elements = list(regex.groups())

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return list of matched elements
        return elements

    ###############################################################################################

    def _process_save(self):
        """
        Store a value in the document under the command's "key".

        A literal "value" in the command is stored as-is. Otherwise the parent text is stored,
        converted through TrackingScraperConverter when "format" is given. Empty text is not
        stored; the result then depends on "required".
        """
        attribute = self.__parent_command.get("key")
        if attribute is None:
            raise TrackingScraperError("Save key not found")

        # If a value was already defined, save it and exit
        value = self.__parent_command.get("value")
        if value is not None:
            self.__document[attribute] = value
            return True

        # Get text to be saved, and verify if it's not empty
        value = self.__get_parent_text()
        if not value:
            logging.info("Text to save is empty, using required")
            return self.__missing_result()

        # Format type if necessary
        format_type = self.__parent_command.get("format")
        if format_type is not None:
            value = TrackingScraperConverter(value, format_type, self.__configuration).convert()

        #  Save according to parent key and formatting value
        self.__document[attribute] = value
        return True

    ###############################################################################################

    def _process_compare(self):
        """
        Compare the parent text against the command's "values" and run the "success" or
        "failure" child commands accordingly.

        "values" may be a list of accepted texts or a single string, which must match exactly.
        Raises TrackingScraperError if "values" is missing.
        """
        # Get text to compare
        text = self.__get_parent_text()

        # Get values to compare
        values = self.__parent_command.get("values")
        if values is None:
            raise TrackingScraperError("Values to compare not found")
        if isinstance(values, str):
            # A bare string would turn "in" into a substring test; compare for equality instead
            values = [values]

        # Check if text equals to value, or if it is in value list, then act accordingly
        if text in values:
            commands = self.__parent_command.get("success")
            return self.__process_compare_commands(commands, "Success")
        else:
            commands = self.__parent_command.get("failure")
            return self.__process_compare_commands(commands, "Failure")

    def __process_compare_commands(self, commands, compare_result):
        """
        Run the commands of one comparison branch on the parent element. If the branch is not
        configured, return True or False depending on "required".
        """
        # Check requirements
        if commands is None:
            logging.info(compare_result + " commands not found, resorting to required")
            return self.__missing_result()

        # Process child commands
        for child_command in commands:
            result = self.__generate_child_process(child_command, self.__parent_element)
            if result is not True:
                return result
        return True

    ###############################################################################################

    def _process_write(self):
        """
        Type a value into the parent element.

        The text is the command's "value" or, failing that, the document entry named by
        "attribute". "clean" clears the element first and "enter" sends ENTER afterwards.
        Raises TrackingScraperError if there is nothing to write or the element rejects input.
        """
        # Get value
        value = self.__parent_command.get("value")
        if value is None:
            # Get value from attribute
            attribute = self.__parent_command.get("attribute")
            if attribute is None:
                raise TrackingScraperError("No value or attribute to use as input")
            value = self.__document.get(attribute)
            if value is None:
                # Fail with a clear message instead of handing None to send_keys
                raise TrackingScraperError("Attribute " + str(attribute)
                                           + " not found in document to use as input")

        try:
            # Clear element if specified
            if self.__parent_command.get("clean", TrackingScraperConfig.DEFAULT_KEY_CLEAN):
                self.__parent_element.clear()
            # Write value
            self.__parent_element.send_keys(value)
            # Send enter if specified
            if self.__parent_command.get("enter", TrackingScraperConfig.DEFAULT_KEY_ENTER):
                self.__parent_element.send_keys(Keys.ENTER)
        except AttributeError:
            raise TrackingScraperError("Element is not interactable (attribute)")
        except ElementNotInteractableException:
            raise TrackingScraperError("Element is not interactable (selenium)")

        # Return True to indicate everything is OK
        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)
        return True

    ###############################################################################################

    def _process_alert(self):
        """
        Accept or dismiss a browser alert, depending on "action" (accept by default).

        "assertion": false raises TrackingScraperError if an alert is present, and
        "assertion": true raises it if none is; otherwise a missing alert is ignored.
        NOTE(review): this handler reads the key "assertion" while the DOM selectors read
        "assert" -- confirm the difference is intentional.
        """
        assertion = self.__parent_command.get("assertion")
        try:
            # Try to switch to alert
            alert = self.__driver.switch_to.alert
            if assertion is False:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly found")
            # Accept or dismiss action depending on command
            if self.__parent_command.get("action", TrackingScraperConfig.DEFAULT_KEY_ACTION):
                alert.accept()
            else:
                alert.dismiss()
        except NoAlertPresentException:
            if assertion is True:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly not found")

        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_click(self):
        """
        Click the parent element and wait afterwards.

        If the element is hidden, disabled or not interactable, nothing is clicked and the
        result depends on "required".
        """
        # Check requirements
        if not self.__parent_element.is_displayed():
            return self.__missing_result()
        if not self.__parent_element.is_enabled():
            return self.__missing_result()

        try:
            # Try to click the element
            self.__parent_element.click()
        except ElementNotInteractableException:
            return self.__missing_result()

        # Use the long wait when "wait" is truthy (the default), the short wait otherwise
        if self.__parent_command.get("wait", TrackingScraperConfig.DEFAULT_KEY_WAIT):
            time.sleep(TrackingScraperConfig.DEFAULT_WAIT_LONG)
        else:
            time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)

        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_ocr(self):
        """
        Ask the user to type the captcha text and store it in the document under "ocr".

        Raises TrackingScraperError if the command has no "length" or if the typed text does
        not have exactly that many characters.
        """
        length = self.__parent_command.get("length")
        if length is None:
            raise TrackingScraperError("Text length not defined")

        # Request text
        text = input("Enter captcha text: ")
        if len(text) != length:
            raise TrackingScraperError("Text is not " + str(length) + " characters long")

        # Save to attribute
        self.__document["ocr"] = text
        return True

## Converter class

In [5]:
class TrackingScraperConverter:
    """Helper that turns a piece of scraped text into another Python type."""

    def __init__(self, raw_text, format_type, configuration):
        """Keep the text to convert, the requested format name and the carrier configuration."""
        self.__raw_text      = raw_text
        self.__format_type   = format_type
        self.__configuration = configuration

    def convert(self):
        """
        Dispatch to the `_convert_to_<format>` method for the requested format.

        The original text is returned when the format is unknown; every converter also falls
        back to the original text when the text cannot be parsed.
        """
        target = self.__format_type
        try:
            return getattr(self, "_convert_to_" + target)()
        except AttributeError:
            logging.info("Convertion to " + target + " not supported, resorting to text")
            return self.__raw_text
        except TypeError:
            raise TrackingScraperError("Convertion to " + target + " cannot be invoked")

    def __parse_number(self, caster, failure_message):
        """Strip the thousand separator and apply caster; log and return the text on failure."""
        digits = self.__raw_text.replace(TrackingScraperConfig.DEFAULT_THOUSAND_SYMBOL, "")
        try:
            return caster(digits)
        except ValueError:
            logging.info(failure_message)
            return self.__raw_text

    def _convert_to_int(self):
        """Parse the text as an integer, ignoring thousand separators."""
        return self.__parse_number(int, "Convertion to integer failed, resorting to text")

    def _convert_to_float(self):
        """Parse the text as a floating-point number, ignoring thousand separators."""
        return self.__parse_number(float, "Convertion to float failed, resorting to text")

    def _convert_to_double(self):
        """Same conversion as `_convert_to_float`, under another format name."""
        return self._convert_to_float()

    def _convert_to_datetime(self):
        """
        Parse the text as a datetime using the first matching pattern listed under
        configuration["general"]["datetimes"].
        """
        text = self.__raw_text

        # Without configured patterns there is nothing to try
        try:
            known_patterns = self.__configuration["general"]["datetimes"]
        except KeyError:
            logging.info("Datetime patterns not found, resorting to text")
            return text

        # The first pattern that parses the text wins
        for candidate in known_patterns:
            try:
                parsed = datetime.strptime(text, candidate)
            except ValueError:
                pass
            else:
                return parsed

        logging.info("None of the patterns matched, resorting to text")
        return text

    def _convert_to_date(self):
        """Parse the text as a datetime and keep only its date part."""
        parsed = self._convert_to_datetime()
        return parsed.date() if isinstance(parsed, datetime) else parsed

    def _convert_to_time(self):
        """Parse the text as a datetime and keep only its time part."""
        parsed = self._convert_to_datetime()
        return parsed.time() if isinstance(parsed, datetime) else parsed

    def _convert_to_local_datetime(self):
        """
        Parse the text as a datetime, then subtract the timedelta described by
        TrackingScraperConfig.DEFAULT_DATETIME_LOCALE.
        """
        parsed = self._convert_to_datetime()
        if not isinstance(parsed, datetime):
            return parsed
        return parsed - timedelta(**TrackingScraperConfig.DEFAULT_DATETIME_LOCALE)

    def _convert_to_local_time(self):
        """Time part of the value produced by `_convert_to_local_datetime`."""
        shifted = self._convert_to_local_datetime()
        return shifted.time() if isinstance(shifted, datetime) else shifted

## Main scraper class

In [6]:
class TrackingScraper:
    """
    Main class for the Tracking Web Scraper.

    Opens a Chrome session, loads the JSON configuration of the document's carrier and runs
    its "input", "single" and "multiple" command sections against the carrier's Web page,
    storing scraped values in the document passed to the constructor.
    """

    def __init__(self, document):
        """
        Start the browser and load "../config/<carrier>.json".

        document: dictionary describing the item to track; must contain the key "carrier".
        Raises TrackingScraperError if the driver cannot be created or the configuration
        cannot be loaded. In the latter case the browser is shut down before raising.
        """
        self.__document = document

        # Initialize WebDriver
        try:
            self.__driver = webdriver.Chrome(executable_path = "../driver/chromedriver")
        except WebDriverException as ex:
            raise TrackingScraperError("Error creating Selenium driver. " + str(ex)) from ex

        # Get configuration file; never leave the browser running if that fails
        try:
            self.__configuration = self.__load_configuration()
        except Exception:
            self.__driver.quit()
            raise

    def __load_configuration(self):
        """Read and return the carrier's JSON configuration file."""
        try:
            carrier = self.__document["carrier"]
        except KeyError:
            raise TrackingScraperError("Carrier not found") from None

        try:
            with open("../config/" + carrier + ".json", encoding = "utf-8") as file:
                return json.load(file)
        except FileNotFoundError:
            raise TrackingScraperError("Configuration file not found") from None
        except json.JSONDecodeError as ex:
            raise TrackingScraperError("Configuration file could not be read: " + str(ex)) from ex

    ###############################################################################################

    def execute(self):
        """
        Run the scraping process: load the page, then execute the "input", "single" and
        "multiple" sections, retrying the first unfinished section until it succeeds.

        Returns True once every section succeeded. Raises TrackingScraperError if that does not
        happen within DEFAULT_TIMEOUT seconds after the page was loaded. Any exception raised
        while scraping propagates to the caller. The browser is always shut down.
        """
        input_result    = False
        single_result   = False
        multiple_result = False

        try:
            start = self._go_to_url()
            while True:
                # Check if we're still on time
                if (time.time() - start) > TrackingScraperConfig.DEFAULT_TIMEOUT:
                    raise TrackingScraperError("Timeout exceeded, scraping was unsuccessful")

                # Execute input
                input_result = self._execute_commands(input_result, "input")
                if input_result is not True:
                    self.__prepare_retry("Input execution was unsuccessful, retrying...")
                    continue

                # Execute single output
                single_result = self._execute_commands(single_result, "single")
                if single_result is not True:
                    self.__prepare_retry("Single output execution was unsuccessful, retrying...")
                    continue

                # Execute multiple output
                multiple_result = self._execute_multiple_output(multiple_result)
                if multiple_result is not True:
                    self.__prepare_retry("Multiple output execution was unsuccessful, retrying...")
                    continue

                # Everything executed correctly
                break
        finally:
            # No "return" here: returning from "finally" would discard any exception in flight.
            # quit() also ends the driver session, which close() leaves running.
            self.__driver.quit()

        return True

    def __prepare_retry(self, message):
        """Log why a section is retried and pause briefly so the retry loop does not spin."""
        logging.info(message)
        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)

    ###############################################################################################

    def _go_to_url(self):
        """
        Open the configured URL, filling its placeholders with the document's values, and wait
        for the page to settle. Returns the time (time.time()) at which scraping starts.
        """
        try:
            link = self.__configuration["general"]["url"]
        except KeyError:
            raise TrackingScraperError("Configuration URL could not be found") from None

        # A placeholder without a matching document key is a different problem from a missing
        # URL, so report it separately
        try:
            url = link.format(**self.__document)
        except KeyError as ex:
            raise TrackingScraperError("Document value " + str(ex)
                                       + " required by the configuration URL was not found") from ex

        try:
            self.__driver.get(url)
        except TimeoutException:
            raise TrackingScraperError("Error loading Web page, timeout exceeded")
        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_LONG)

        # Start time
        return time.time()

    ###############################################################################################

    def _execute_commands(self, parent_result, key):
        """
        Execute the list of commands stored under key in the configuration.

        parent_result is the result of the previous attempt: a section that already succeeded
        is not executed again. Returns True if the section succeeded (or does not exist), False
        as soon as one command does not return True.
        """
        # Check if commands were already executed
        if parent_result is True:
            return True

        # Get commands, if none found, return True
        commands = self.__configuration.get(key)
        if commands is None:
            return True

        # Process parent commands
        for command in commands:
            result = TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                             command).process()
            if result is not True:
                return False

        # Return True if everything was OK
        return True

    ###############################################################################################

    def _execute_multiple_output(self, multiple_result):
        """
        Execute the "multiple" section, which scrapes one sub-document per repeated element.

        Sub-documents are collected in the document's "tracking" list. When the section's
        "overwrite" key is truthy the list is emptied first; otherwise new items are appended.
        Returns True if the section succeeded (or does not exist), False otherwise.
        """
        if multiple_result is True:
            return True

        # Get multiple command, if none found, return True
        multiple = self.__configuration.get("multiple")
        if multiple is None:
            return True

        # Overwrite previous tracking items, if necessary. The default constant may not be
        # defined on TrackingScraperConfig, hence the getattr fallback.
        # NOTE(review): True as fallback is an assumption -- confirm the intended default.
        default_overwrite = getattr(TrackingScraperConfig, "DEFAULT_KEY_OVERWRITE", True)
        if multiple.get("overwrite", default_overwrite):
            self.__document["tracking"] = []

        # Generate and process multiple documents
        return self.__iterate_multiple_elements(multiple, {})

    def __iterate_multiple_elements(self, multiple_command, multiple_document = None,
                                    previous_element = None):
        """
        Run the "single" commands of multiple_command once per parent element.

        multiple_command:  dictionary with a "single" command list and, optionally, a "parents"
                           command that selects the repeated elements.
        multiple_document: initial values copied into the sub-document of every element.
        previous_element:  element under which "parents" is searched (the driver if None).

        Without "parents" the single commands run once against the driver. Sub-documents are
        appended to the document's "tracking" list only if every element succeeded, so a retry
        never stores partial results twice.
        NOTE(review): storing one sub-document per element in "tracking" is the reading taken
        from the unfinished original code -- confirm against the intended data model.
        """
        template = {} if multiple_document is None else multiple_document

        # Get single commands
        multiple_single_commands = multiple_command.get("single")
        if not isinstance(multiple_single_commands, list):
            raise TrackingScraperError("Multiple command must have single commands key")

        # Get parent elements, if none found, use driver to extract single commands
        multiple_parents = multiple_command.get("parents")
        if multiple_parents is None:
            multiple_elements = [self.__driver]
        else:
            multiple_elements = TrackingScraperSwitcher(self.__driver, {}, self.__configuration,
                                                        multiple_parents,
                                                        previous_element).process()
            if not isinstance(multiple_elements, list):
                raise TrackingScraperError("Parent elements must be a list of web elements")

        # Iterate through multiple elements, filling one fresh sub-document for each
        collected = []
        for element in multiple_elements:
            element_document = dict(template)
            for command in multiple_single_commands:
                result = TrackingScraperSwitcher(self.__driver, element_document,
                                                 self.__configuration, command,
                                                 element).process()
                if result is not True:
                    return False
            collected.append(element_document)

        self.__document.setdefault("tracking", []).extend(collected)
        return True

    ###############################################################################################

    def _insert_or_update(self):
        """Not implemented: currently does nothing."""
        pass

## Tests

In [7]:
# Sample document handed to the scraper; "carrier" selects the configuration file to load
container = dict(
    year      = "test",
    manifest  = "test",
    detail    = "test",
    container = "EGSU9089973",
    carrier   = "Evergreen",
)

# Hapag-Lloyd: FSCU5670046, HLXU5183586
# Maersk: MAEU6835658
# Evergreen: EGSU9089973

In [8]:
# Run the scraper on the sample container; failures are written to the log with traceback
try:
    scraper = TrackingScraper(container)
    scraper.execute()
except TrackingScraperError:
    # Known scraper failure
    logging.exception("Exception ocurred")
    print("Finished with errors")
except Exception:
    # Anything else is unexpected
    logging.exception("Unknown exception ocurred")
    print("Finished with unknown exception")
else:
    print("Finished")

Enter captcha text: LQTB


In [9]:
container

{'carrier': 'Evergreen',
 'container': 'EGSU9089973',
 'detail': 'test',
 'estimated_arrival': datetime.date(2019, 4, 11),
 'manifest': 'test',
 'ocr': 'LQTB',
 'type': "40'(SH)",
 'vessel_voyage': 'EVER LAMBENT 0403-037W',
 'year': 'test'}