# Tracking Scraper

In [1]:
import datetime
import json
import logging
import re
import sys
import time

from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Send every scraper message of level INFO and above to the dated log file.
logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)s %(asctime)s] %(message)s",
    filename="../logs/scraper-20190325.log",
)

# MongoDB client created with default connection settings, and the scraper's database handle.
client = MongoClient()
database = client["tracking_scraper"]

## Exception class

In [3]:
class TrackingScraperError(Exception):
    """Error raised by the tracking scraper classes when a command or configuration is unusable."""

## Switcher class

In [4]:
class TrackingScraperSwitcher:
    """
    Switcher for selecting and saving Web elements and subelements in a tracking-related document.

    Each instance wraps one configuration command (a dictionary with a "type" key) together with
    the parent element the command is applied to. The parent element is either the Selenium
    driver, a Web element, or a plain string produced by a previous "split" or "regex" command.

    Handlers follow one return convention:
      * True  - the command and all of its subcommands succeeded.
      * False - a required element/text was missing (the caller may retry).
      * list  - no subcommands were configured, so the found elements are returned as-is.
    Configuration errors are reported by raising TrackingScraperError.
    """
    __DEFAULT_TIMEOUT      = 30
    __DEFAULT_WAIT_LONG    = 5
    __DEFAULT_WAIT_SHORT   = 2
    __DEFAULT_KEY_ACTION   = True
    __DEFAULT_KEY_REQUIRED = True
    __DEFAULT_KEY_WAIT     = True
    # NOTE(review): these two defaults did not exist before (the "write" command crashed without
    # them). Clearing first and not pressing Enter is the conservative choice - confirm against
    # the configuration files.
    __DEFAULT_KEY_CLEAN    = True
    __DEFAULT_KEY_ENTER    = False

    # Command types that can be dispatched to a "_process_<type>" handler.
    __PROCESS_TYPES = frozenset({
        "id", "class", "css", "name", "tag", "xpath",
        "split", "regex", "save", "compare", "write", "alert", "click", "ocr",
    })

    def __init__(self, driver, document, configuration, parent_command, parent_element = None):
        self.__driver         = driver
        self.__document       = document
        self.__configuration  = configuration
        self.__parent_command = parent_command
        # Top-level commands have no parent element, so they are applied to the driver itself
        self.__parent_element = driver if parent_element is None else parent_element

    @property
    def document(self):
        """Returns the stored tracking-related dictionary."""
        return self.__document

    ###############################################################################################

    def process(self):
        """
        Get Web elements based on the current configuration command, then process or return them
        accordingly. Returns True if all commands and subcommands were executed successfully,
        False if one command failed, or the list of Web elements if no subcommands were found.

        Raises TrackingScraperError if the command has no "type" or the type is unknown.
        """
        # Get process type
        process_type = self.__parent_command.get("type")
        if process_type is None:
            raise TrackingScraperError("Process type not found")
        logging.info("Process type: %s", process_type)

        # Validate the type before dispatching. The handler is deliberately called outside any
        # try/except so that errors raised inside it are not mislabelled as an invalid type.
        if not isinstance(process_type, str) or process_type not in self.__PROCESS_TYPES:
            raise TrackingScraperError("Process type " + str(process_type) + " is not valid")
        return getattr(self, "_process_" + process_type)()

    ###############################################################################################

    def _process_id(self):
        return self._process_dom_elements(By.ID)
    def _process_class(self):
        return self._process_dom_elements(By.CLASS_NAME)
    def _process_css(self):
        return self._process_dom_elements(By.CSS_SELECTOR)
    def _process_name(self):
        return self._process_dom_elements(By.NAME)
    def _process_tag(self):
        return self._process_dom_elements(By.TAG_NAME)
    def _process_xpath(self):
        return self._process_dom_elements(By.XPATH)

    def _process_dom_elements(self, selector_type):
        """
        Find the elements matching the command's "selector" under the parent element and hand
        them to the child command(s).

        The optional "assert" key controls the lookup: True waits up to the default timeout for
        at least one match and raises if none appears; False raises if any match is found; any
        other value performs a plain lookup.
        """
        # Get selector
        selector = self.__parent_command.get("selector")
        if selector is None:
            raise TrackingScraperError("Selector not found in process by " + selector_type)

        # Get DOM elements depending on assertion
        assertion = self.__parent_command.get("assert")
        if assertion is True: # Assert at least one element found
            try:
                # An empty list is falsy, so the wait keeps polling until something is found
                dom_elements = WebDriverWait(self.__parent_element, self.__DEFAULT_TIMEOUT).until(
                    lambda parent: parent.find_elements(selector_type, selector))
            except TimeoutException:
                raise TrackingScraperError("Assertion error: Elements unexpectedly not found")
        else:
            dom_elements = self.__parent_element.find_elements(selector_type, selector)
            if assertion is False and len(dom_elements) > 0: # Assert no elements found
                raise TrackingScraperError("Assertion error: Elements unexpectedly found")

        # Check requirements
        if len(dom_elements) == 0:
            logging.info("No elements found, using required")
            return not self.__parent_command.get("required", self.__DEFAULT_KEY_REQUIRED)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, dom_elements)

        # Get a child command for all elements and process them, if possible
        child_command = self.__parent_command.get("command")
        if child_command is not None:
            for child_element in dom_elements:
                result = self.__generate_child_process(child_command, child_element)
                if result is not True:
                    return result
            return True

        # If no single child command was found, return all DOM elements
        return dom_elements

    def __process_child_commands(self, commands, elements):
        """
        Run every child command against the element selected by its "index" key. Stops at the
        first child whose result is not True and returns that result.
        """
        for child_command in commands:
            # Get index
            index = child_command.get("index")
            if index is None:
                raise TrackingScraperError("Child index command not found")

            # Check requirements
            if index >= len(elements):
                logging.info("Child element at index %s not found, using required", index)
                return not child_command.get("required", self.__DEFAULT_KEY_REQUIRED)

            # Process child element at specified index
            logging.info("Child index: %d", index)
            result = self.__generate_child_process(child_command, elements[index])

            # If no subelements were found, return that element or element list
            # If a minor error occured (e.g. element not found), return False
            if result is not True:
                return result

        # If everything was fine, return True
        return True

    def __generate_child_process(self, child_command, child_element):
        """Process a child command with a new switcher sharing this driver, document and config."""
        return TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                       child_command, child_element).process()

    ###############################################################################################

    def _process_split(self):
        """Split the parent text by the command's "delimiter" and process or return the parts."""
        # Get text to split
        parent_text = self.__get_parent_text()

        # Get text separator
        delimiter = self.__parent_command.get("delimiter")
        if delimiter is None:
            raise TrackingScraperError("No separator found")

        # Split text
        elements = parent_text.split(delimiter)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return split list
        return elements

    def __get_parent_text(self):
        """
        Return the stripped text of the parent element: its "text" attribute when it has one
        (a DOM element), otherwise the parent itself when it is already a string.
        """
        parent = self.__parent_element
        text = getattr(parent, "text", parent)
        if not isinstance(text, str):
            raise TrackingScraperError("Parent element has no text to process")
        return text.strip()

    ###############################################################################################

    def _process_regex(self):
        """
        Search the parent text with the command's "pattern" and process or return the list of
        captured groups.
        """
        # Get text
        text = self.__get_parent_text()

        # Get regular expression pattern
        pattern = self.__parent_command.get("pattern")
        if pattern is None:
            raise TrackingScraperError("No regular expression found")

        # Match expression with text
        regex = re.search(pattern, text)
        if regex is None:
            logging.info("Regular expression does not match text, using required")
            return not self.__parent_command.get("required", self.__DEFAULT_KEY_REQUIRED)

        # Get list of matched elements. Optional groups that did not take part in the match are
        # None; use an empty string instead, because a None parent element means "use the driver"
        elements = ["" if group is None else group for group in regex.groups()]

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return list of matched elements
        return elements

    ###############################################################################################

    def _process_save(self):
        """
        Store the parent text in the document under the command's "key", converted according to
        the optional "format" key. Empty text is not saved and falls back to "required".
        """
        key = self.__parent_command.get("key")
        if key is None:
            raise TrackingScraperError("Save key not found")

        # Get text to be saved, and verify if it's not empty
        text = self.__get_parent_text()
        if len(text) == 0:
            logging.info("Text to save is empty, using required")
            return not self.__parent_command.get("required", self.__DEFAULT_KEY_REQUIRED)

        # Save according to parent key and formatting value
        self.__document[key] = self.__get_save_data(text, self.__parent_command.get("format"))

        # Return True to indicate everything was OK
        return True

    def __get_save_data(self, text, format_type):
        """
        Convert text to the requested format ("int", "float"/"double", "date", "time" or
        "datetime"). Text that cannot be converted, or an unknown format, is returned unchanged.
        Date/time formats use the pattern stored under the same name in the "general" section of
        the configuration; a missing pattern raises TrackingScraperError.
        """
        # No format requested: keep the plain text
        if format_type is None:
            return text

        # Convert to integer
        if format_type == "int":
            try:
                return int(text.replace(",", ""))
            except ValueError:
                logging.info("Text could not be processed to integer, resorting to text")
                return text

        # Converting to floating-point value
        if format_type in ("float", "double"):
            try:
                return float(text.replace(",", ""))
            except ValueError:
                logging.info("Text could not be processed to float, resorting to text")
                return text

        # Convert to date, time or datetime object
        if format_type in ("date", "time", "datetime"):
            # Get pattern
            pattern = self.__configuration.get("general", {}).get(format_type)
            if pattern is None:
                raise TrackingScraperError("No " + format_type + " patterns found")
            # Only datetime.datetime provides strptime, so parse with it and narrow afterwards
            try:
                parsed = datetime.datetime.strptime(text, pattern)
            except ValueError:
                logging.info("Text could not be processed to %s, resorting to text", format_type)
                return text
            # NOTE(review): if the document is later stored with pymongo, plain date/time
            # objects may need converting first - confirm against the code that saves it.
            if format_type == "date":
                return parsed.date()
            if format_type == "time":
                return parsed.time()
            return parsed

        # If format type is not recognized, return simple text
        logging.info("Format type %s not found, resorting to text", format_type)
        return text

    ###############################################################################################

    def _process_compare(self):
        """
        Compare the parent text with the command's "values" (a single string or a collection of
        strings) and run the "success" or "failure" child commands accordingly.
        """
        # Get text to compare
        text = self.__get_parent_text()

        # Get values to compare
        values = self.__parent_command.get("values")
        if values is None:
            raise TrackingScraperError("Values to compare not found")

        # Check if text equals to value, or if it is in value list, then act accordingly.
        # A single string is compared for equality: "in" would be a substring test.
        matched = (text == values) if isinstance(values, str) else (text in values)
        if matched:
            return self.__process_compare_commands(self.__parent_command.get("success"), "Success")
        return self.__process_compare_commands(self.__parent_command.get("failure"), "Failure")

    def __process_compare_commands(self, child_commands, compare_result = "Compare"):
        """
        Run the child commands of a comparison outcome against the current parent element.
        compare_result is only used to label the log message when no commands are configured.
        """
        # Check requirements
        if child_commands is None:
            logging.info("%s commands not found, resorting to required", compare_result)
            return not self.__parent_command.get("required", self.__DEFAULT_KEY_REQUIRED)

        # Process child commands
        for child_command in child_commands:
            result = self.__generate_child_process(child_command, self.__parent_element)
            if result is not True:
                return result
        return True

    ###############################################################################################

    def _process_write(self):
        """
        Type a value into the parent element. The value is the command's "value" or, failing
        that, the document entry named by "attribute". "clean" clears the element first and
        "enter" sends the Enter key afterwards.
        """
        # Get value
        value = self.__parent_command.get("value")
        if value is None:
            # Get value from attribute
            attribute = self.__parent_command.get("attribute")
            if attribute is None:
                raise TrackingScraperError("No value or attribute to use as input")
            value = self.__document.get(attribute)
            if value is None:
                raise TrackingScraperError("Attribute " + str(attribute) + " not found in document")

        try:
            # Clear element if specified
            if self.__parent_command.get("clean", self.__DEFAULT_KEY_CLEAN):
                self.__parent_element.clear()
            # Write value
            self.__parent_element.send_keys(value)
            # Send enter if specified
            if self.__parent_command.get("enter", self.__DEFAULT_KEY_ENTER):
                self.__parent_element.send_keys(Keys.ENTER)
        except AttributeError:
            # Parent is not a Web element (e.g. a string from a split/regex command)
            raise TrackingScraperError("Element is not interactable (attribute)")
        except ElementNotInteractableException:
            raise TrackingScraperError("Element is not interactable (selenium)")

        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_alert(self):
        """
        Accept (default) or dismiss the current browser alert depending on the "action" key.
        With "assertion" True a missing alert raises; with False a present alert raises.
        """
        # NOTE(review): DOM commands read this flag from "assert", this one from "assertion" -
        # confirm which key the configuration files actually use.
        assertion = self.__parent_command.get("assertion")
        try:
            # Try to switch to alert
            alert = self.__driver.switch_to.alert
            if assertion is False:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly found")
            # Accept or dismiss action depending on command
            if self.__parent_command.get("action", self.__DEFAULT_KEY_ACTION):
                alert.accept()
            else:
                alert.dismiss()
        except NoAlertPresentException:
            if assertion is True:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly not found")

        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_click(self):
        """
        Click the parent element and pause afterwards. An element that is hidden or not
        interactable is not an error: the result then depends on the "required" key.
        """
        # Check requirements
        required = self.__parent_command.get("required", self.__DEFAULT_KEY_REQUIRED)
        if not self.__parent_element.is_displayed():
            return not required

        try:
            # Try to click the element
            self.__parent_element.click()
        except ElementNotInteractableException:
            return not required

        # Wait the long or short default depending on "wait" attribute
        long_wait = self.__parent_command.get("wait", self.__DEFAULT_KEY_WAIT)
        time.sleep(self.__DEFAULT_WAIT_LONG if long_wait else self.__DEFAULT_WAIT_SHORT)
        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_ocr(self):
        """
        Ask the operator to type the captcha text on the console and store it in the document
        under "ocr". The text must be exactly "length" characters long.
        """
        length = self.__parent_command.get("length")
        if length is None:
            raise TrackingScraperError("Text length not defined")

        # Request text
        text = input("Enter captcha text: ")
        if len(text) != length:
            raise TrackingScraperError("Text is not " + str(length) + " characters long")

        # Save to attribute
        self.__document["ocr"] = text
        return True

## Converter class

In [None]:
class TrackingScraperConverter:
    """
    Utility class to convert text to other Python types and classes.

    Conversion is not implemented yet: convert() hands back the original text and every
    type-specific converter is a placeholder that returns None.
    """

    def __init__(self, convertion_text, convertion_type):
        self.__convertion_type = convertion_type
        self.__convertion_text = convertion_text

    def convert(self):
        """Return the stored text unchanged."""
        stored_text = self.__convertion_text
        return stored_text

    def _convert_int(self):
        """Placeholder for integer conversion; currently does nothing."""
        return None

    def _convert_float(self):
        """Placeholder for floating-point conversion; currently does nothing."""
        return None

    def _convert_date(self):
        """Placeholder for date conversion; currently does nothing."""
        return None

    def _convert_time(self):
        """Placeholder for time conversion; currently does nothing."""
        return None

    def _convert_datetime(self):
        """Placeholder for datetime conversion; currently does nothing."""
        return None

## Main scraper class

In [5]:
class TrackingScraper:
    """
    Main scraper class.

    Opens the carrier Web page described by the configuration, runs the "input" commands and
    then the "single" output commands against it, and stores what they extract in the
    document dictionary passed to the constructor.
    """

    __DEFAULT_TIMEOUT    = 60
    __DEFAULT_WAIT       = 5
    # Pause between retries so a failing command does not spin at full speed until the timeout
    __DEFAULT_RETRY_WAIT = 1

    def __init__(self, document, configuration):
        self.__document      = document
        self.__configuration = configuration

        # Initialize WebDriver
        try:
            self.__driver = webdriver.Firefox(executable_path = "../driver/geckodriver")
        except WebDriverException as ex:
            raise TrackingScraperError("Error creating Selenium driver. " + str(ex))

    ###############################################################################################

    def execute(self):
        """
        Run the whole scraping process and shut the browser down afterwards.

        Input and single-output commands are retried until both succeed or the default timeout
        is exceeded; a step that already succeeded is not executed again. Returns True when
        everything executed correctly and False otherwise; errors are logged, not raised.
        """
        parent_result = False
        input_result  = False
        single_result = False

        try:
            self._go_to_url()
            start = time.time()

            while True:
                # Check if we're still on time
                if (time.time() - start) > self.__DEFAULT_TIMEOUT:
                    raise TrackingScraperError("Timeout exceeded, scraping was unsuccessful")

                # Execute input
                input_result = self._execute_input(input_result)
                if input_result is not True:
                    logging.info("Input execution was unsuccessful, retrying...")
                    time.sleep(self.__DEFAULT_RETRY_WAIT)
                    continue

                # Execute single output
                single_result = self.__execute_single_output(single_result)
                if single_result is not True:
                    logging.info("Single output execution was unsuccessful, retrying...")
                    time.sleep(self.__DEFAULT_RETRY_WAIT)
                    continue

                # TODO: multiple output execution is not implemented yet
                break

            # Return True if everything executed correctly
            parent_result = True

        # Exception handling
        except TrackingScraperError:
            logging.exception("Error occured while executing scraper")
        except Exception:
            logging.exception("Unknown exception occured")
        finally:
            # quit() ends the whole browser session, not just the current window
            try:
                self.__driver.quit()
            except WebDriverException:
                logging.exception("Error closing Selenium driver")

        # Returned after (not inside) "finally" so that exceptions which are not handled above,
        # such as KeyboardInterrupt, still propagate to the caller
        return parent_result

    ###############################################################################################

    def _go_to_url(self):
        """
        Load the configured URL, filling its "{placeholders}" with values from the document,
        then wait for the page to settle. Raises TrackingScraperError if the URL is missing,
        a placeholder has no value in the document, or the page load times out.
        """
        try:
            link = self.__configuration["general"]["url"]
        except KeyError:
            raise TrackingScraperError("Configuration URL could not be found")

        try:
            url = link.format(**self.__document)
        except KeyError as ex:
            raise TrackingScraperError("URL placeholder " + str(ex) + " not found in document")

        try:
            self.__driver.get(url)
        except TimeoutException:
            raise TrackingScraperError("Error loading Web page, timeout exceeded")
        time.sleep(self.__DEFAULT_WAIT)

    ###############################################################################################

    def _execute_input(self, input_result):
        """
        Run the "input" commands of the configuration. Returns True if they already ran
        (input_result is True), none are configured, or all succeeded; otherwise returns the
        result of the first command that did not return True.
        """
        # Check if input was already executed
        if input_result is True:
            return True

        # Get input commands, if none found, return True
        input_commands = self.__configuration.get("input")
        if input_commands is None:
            return True

        # Process parent input commands
        for input_command in input_commands:
            result = TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                             input_command).process()
            if result is not True:
                return result

        # Return True if everything was OK
        return True

    ###############################################################################################

    def __execute_single_output(self, single_result):
        """
        Run the "single" output commands of the configuration. Returns True if they already ran
        (single_result is True), none are configured, or all succeeded; otherwise returns the
        result of the first command that did not return True.
        """
        # Check if single output was already executed
        if single_result is True:
            return True

        # Get output commands, if none found, return True
        single_commands = self.__configuration.get("single")
        if single_commands is None:
            return True

        # Process parent single output commands
        for single_command in single_commands:
            result = TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                             single_command).process()
            if result is not True:
                return result

        # Return True if everything was OK
        return True

## Tests

In [6]:
# Load the carrier configuration. The encoding is explicit so the JSON file is decoded as
# UTF-8 whatever the platform's default encoding is.
with open("../config/Hapag-Lloyd.json", encoding = "utf-8") as config_file:
    config = json.load(config_file)

In [7]:
# Sample tracking document used to try the scraper; every value is kept as a string.
container = dict(
    year="2019",
    manifest="450",
    detail="874",
    container="HLXU5183586",
    carrier="Hapag-Lloyd",
)