In [2]:
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By

from datetime import *
from pymongo import MongoClient

import json
import logging
import re
import time
import sys

In [3]:
# Logging options: INFO level and above is written to the scraper log file
logging_options = {
    "filename": "../logs/scraper-20190325.log",
    "level": logging.INFO,
    "format": "[%(levelname)s %(asctime)s] %(message)s",
}
logging.basicConfig(**logging_options)

# MongoDB connection (default client settings) and the scraper database handle
client = MongoClient()
database = client["tracking_scraper"]

## Exception class

In [4]:
class TrackingScraperError(Exception):
    """Raised when a scraper command is misconfigured or one of its assertions fails."""

## Switcher class

In [6]:
class TrackingScraperSwitcher:
    """
    Switcher for selecting and saving Web elements and subelements in a tracking-related document.

    Each instance executes one configuration command (a dictionary with a "type" key) against a
    parent element. The parent element is the driver itself when none is given, a DOM element
    found by a previous command, or a plain string produced by a "split" or "regex" command.
    """

    __DEFAULT_ACTION   = True
    __DEFAULT_REQUIRED = True
    __DEFAULT_WAIT     = True

    # Helpers that share the "_process_" prefix but are not zero-argument command handlers
    __RESERVED_TYPES = frozenset(["dom_elements"])

    def __init__(self, driver, document, configuration, parent_command, parent_element = None):
        """
        Store the execution context of a single command.

        driver:          object used for element lookups and alert handling.
        document:        dictionary where saved values are written.
        configuration:   full configuration dictionary (its "general" section holds date patterns).
        parent_command:  command dictionary to execute.
        parent_element:  element or string the command operates on; defaults to the driver.
        """
        self.__driver         = driver
        self.__document       = document
        self.__configuration  = configuration
        self.__parent_command = parent_command
        self.__parent_element = driver if (parent_element is None) else parent_element

    @property
    def document(self):
        """Returns the stored tracking-related dictionary."""
        return self.__document

    ###############################################################################################

    def process(self):
        """
        Get Web elements based on the current configuration command, then process or return them
        accordingly. Returns True if all commands and subcommands were executed successfully,
        False if one command failed, or the list of Web elements if no subcommands were found.

        Raises TrackingScraperError if the command has no type or the type has no handler.
        """
        # Get process type
        process_type = self.__parent_command.get("type")
        if process_type is None:
            raise TrackingScraperError("Process type not found")
        logging.info("Process type: %s", process_type)

        # Look the handler up WITHOUT invoking it inside a try block, so that an AttributeError
        # or TypeError raised by the handler itself is not misreported as an invalid type
        method = None
        if isinstance(process_type, str) and (process_type not in self.__RESERVED_TYPES):
            method = getattr(self, "_process_" + process_type, None)
        if method is None:
            raise TrackingScraperError("Process type " + str(process_type) + " is not valid")
        if not callable(method):
            raise TrackingScraperError("Process type " + str(process_type) + " cannot be invoked")

        # Execute process based on process type
        return method()

    ###############################################################################################

    def _process_id(self):
        """Select child elements by ID."""
        return self._process_dom_elements(By.ID)
    def _process_class(self):
        """Select child elements by class name."""
        return self._process_dom_elements(By.CLASS_NAME)
    def _process_css(self):
        """Select child elements by CSS selector."""
        return self._process_dom_elements(By.CSS_SELECTOR)
    def _process_name(self):
        """Select child elements by name."""
        return self._process_dom_elements(By.NAME)
    def _process_tag(self):
        """Select child elements by tag name."""
        return self._process_dom_elements(By.TAG_NAME)
    def _process_xpath(self):
        """Select child elements by XPath."""
        return self._process_dom_elements(By.XPATH)

    def _process_dom_elements(self, selector_type):
        """
        Find elements under the parent element using the command's "selector", check the
        optional "assert" and "required" flags, then run the child commands on them.

        Returns the child-command result, the element list if there are no child commands, or
        `not required` when nothing was found. Raises TrackingScraperError on a missing selector,
        a failed assertion, or contradictory "assert"/"required" values.
        """
        # Get selector
        selector = self.__parent_command.get("selector")
        if selector is None:
            raise TrackingScraperError("Selector not found in process by " + selector_type)

        # Get DOM elements
        assertion = self.__parent_command.get("assert")
        dom_elements = self.__parent_element.find_elements(selector_type, selector)

        # Check assertions
        if (assertion is True) and (len(dom_elements) == 0): # Assert at least one element found
            raise TrackingScraperError("Assertion error: Elements unexpectedly not found")
        if (assertion is False) and (len(dom_elements) > 0): # Assert no elements found
            raise TrackingScraperError("Assertion error: Elements unexpectedly found")

        # Check requirements
        required = self.__parent_command.get("required", self.__DEFAULT_REQUIRED)
        if (assertion is not None) and (assertion != required):
            raise TrackingScraperError("Assertion value and required value cannot be opposites")
        if len(dom_elements) == 0:
            logging.info("No elements found, using required")
            return not required

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, dom_elements)

        # Get a child command for all elements and process them, if possible
        child_command = self.__parent_command.get("command")
        if child_command is not None:
            for child_element in dom_elements:
                result = self.__generate_child_process(child_command, child_element)
                if result is not True:
                    return result
            return True

        # If no single child command was found, return all DOM elements
        return dom_elements

    def __process_child_commands(self, commands, elements):
        """
        Run each indexed child command on the element at its "index".

        Stops at the first result that is not True and returns it. A missing element returns
        `not required` for that child command. Raises TrackingScraperError if a child command
        has no "index".
        """
        for child_command in commands:
            # Get index
            index = child_command.get("index")
            if index is None:
                raise TrackingScraperError("Child index command not found")

            # Check requirements (negative indexes count from the end, as in list indexing)
            if (index >= len(elements)) or (index < -len(elements)):
                logging.info("Child element at index %s not found, using required", index)
                return not child_command.get("required", self.__DEFAULT_REQUIRED)

            # Process child element at specified index
            logging.info("Child index: %d", index)
            result = self.__generate_child_process(child_command, elements[index])

            # If no subelements were found, return that element or element list
            # If a minor error occured (e.g. element not found), return False
            if result is not True:
                return result

        # If everything was fine, return True
        return True

    def __generate_child_process(self, child_command, child_element):
        """Run a child command on a child element with the same driver, document and configuration."""
        return TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                       child_command, child_element).process()

    ###############################################################################################

    def _process_split(self):
        """
        Split the parent text by the command's "delimiter" and run the child commands on the
        parts. Returns the list of parts if there are no child commands.
        """
        # Get text to split
        parent_text = self.__get_parent_text()

        # Get text separator
        delimiter = self.__parent_command.get("delimiter")
        if delimiter is None:
            raise TrackingScraperError("No separator found")

        # Split text
        elements = parent_text.split(delimiter)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return split list
        return elements

    def _process_regex(self):
        """
        Search the parent text with the command's "pattern" and run the child commands on the
        captured groups. Returns the list of groups if there are no child commands, or
        `not required` if the pattern does not match.
        """
        # Get text
        text = self.__get_parent_text()

        # Get regular expression pattern
        pattern = self.__parent_command.get("pattern")
        if pattern is None:
            raise TrackingScraperError("No regular expression found")

        # Match expression with text
        regex = re.search(pattern, text)
        if regex is None:
            logging.info("Regular expression does not match text, using required")
            return not self.__parent_command.get("required", self.__DEFAULT_REQUIRED)

        # Get list of matched elements; an optional group that did not participate is None,
        # which a child switcher would silently replace with the driver, so use "" instead
        elements = ["" if (group is None) else group for group in regex.groups()]

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return list of matched elements
        return elements

    def _process_save(self):
        """
        Save the parent text in the document under the command's "key", converted according to
        the optional "format". Returns True on success, or `not required` if the text is empty.
        """
        key = self.__parent_command.get("key")
        if key is None:
            raise TrackingScraperError("Save key not found")

        # Get text to be saved, and verify if it's not empty
        text = self.__get_parent_text()
        if len(text) == 0:
            logging.info("Text to save is empty, using required")
            return not self.__parent_command.get("required", self.__DEFAULT_REQUIRED)

        # Convert text to the requested format, if any
        format_type = self.__parent_command.get("format")
        saved_value = text if (format_type is None) else self.__get_save_data(text, format_type)

        # Save according to parent key
        self.__document[key] = saved_value
        return True

    def __get_parent_text(self):
        """
        Return the stripped text of the parent element, which is either a string or an object
        exposing a `text` attribute. Raises TrackingScraperError for anything else.
        """
        source = self.__parent_element
        if isinstance(source, str):
            return source.strip() # value is already a string
        try:
            return source.text.strip() # value is a DOM element, we need its inner text
        except AttributeError:
            raise TrackingScraperError("Parent element has no text")

    def __get_save_data(self, text, format_type):
        """
        Convert text to the given format type, falling back to the original text when the
        conversion fails or the format type is unknown.

        "date", "time" and "datetime" are parsed with the pattern stored under the same name in
        the "general" configuration section and always produce a datetime.datetime object.
        "int" and "float"/"double" ignore thousands separators (commas).
        """
        # Local import: at module level the name "time" refers to the time module, and the date
        # class has no strptime on older Python versions, so always parse through datetime
        from datetime import datetime as datetime_class

        # Convert to datetime object
        if format_type in ["date", "time", "datetime"]:
            # Get pattern
            pattern = self.__configuration["general"].get(format_type)
            if pattern is None:
                raise TrackingScraperError("Date pattern not found")
            # Convert with the pattern of the specified type
            try:
                return datetime_class.strptime(text, pattern)
            except ValueError:
                logging.info("Text could not be processed to %s, resorting to text", format_type)
                return text

        # Convert to integer
        if format_type == "int":
            try:
                return int(text.replace(",", ""))
            except ValueError:
                logging.info("Text could not be processed to integer, resorting to text")
                return text

        # Converting to floating-point value
        if format_type in ["float", "double"]:
            try:
                return float(text.replace(",", ""))
            except ValueError:
                logging.info("Text could not be processed to float, resorting to text")
                return text

        # If format type is not recognized, stay in text
        logging.info("Format type %s not found, resorting to text", format_type)
        return text

    ###############################################################################################

    def _process_write(self):
        """
        Resolve the value to write from the command's "value", or from the document entry named
        by the command's "attribute". Always returns True.

        NOTE(review): the actual write is not implemented yet; the resolved value is unused.
        """
        # Get value
        value = self.__parent_command.get("value")
        if value is None:
            # Get value from the document attribute (None if the attribute is missing)
            attribute = self.__parent_command.get("attribute")
            value = self.__document.get(attribute)

        # Write (TODO: not implemented)
        return True

    ###############################################################################################

    def _process_alert(self):
        """
        Accept or dismiss the current browser alert depending on the command's "action".

        The optional assertion flag (True: an alert must exist, False: it must not) is read from
        "assert", as in the DOM commands, falling back to the legacy "assertion" key.
        Raises TrackingScraperError if the assertion fails. Returns True otherwise.
        """
        assertion = self.__parent_command.get("assert", self.__parent_command.get("assertion"))

        try:
            alert = self.__driver.switch_to.alert
            if assertion is False:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly found")
            # Accept or dismiss action depending on command
            if self.__parent_command.get("action", self.__DEFAULT_ACTION):
                alert.accept()
            else:
                alert.dismiss()
        except NoAlertPresentException:
            if assertion is True:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly not found")

        return True

    def _process_click(self):
        """
        Click the parent element and wait afterwards (5 seconds if "wait" is truthy, else 2).
        Returns True on success, or `not required` if the element is hidden or not interactable.
        """
        required = self.__parent_command.get("required", self.__DEFAULT_REQUIRED)

        # Check if element is visible
        if not self.__parent_element.is_displayed():
            return not required

        # Wait 2 or 5 seconds depending on "wait" attribute
        wait_time = 5 if self.__parent_command.get("wait", self.__DEFAULT_WAIT) else 2

        # Click the element
        try:
            self.__parent_element.click()
            time.sleep(wait_time)
            return True
        except ElementNotInteractableException:
            return not required

    def _process_compare(self):
        """Placeholder for the compare command; prints a notice and returns True."""
        print("COMPARE NOT IMPLEMENTED YET")
        return True

    def _process_ocr(self):
        """
        Ask the user to type the captcha text and save it in the document under "ocr".
        Raises TrackingScraperError if "length" is missing or the typed text has another length.
        """
        length = self.__parent_command.get("length")
        if length is None:
            raise TrackingScraperError("Text length not defined")

        # Request text
        text = input("Enter captcha text: ")
        if len(text) != length:
            raise TrackingScraperError("Text is not " + str(length) + " characters long")

        # Save to attribute
        self.__document["ocr"] = text
        return True

In [8]:
# Load the carrier configuration; JSON text is UTF-8, so do not rely on the locale default
with open("../config/Hapag-Lloyd.json", encoding = "utf-8") as file:
    config = json.load(file)

In [11]:
# Sample container metadata; also the document the switcher reads from and writes to
container = dict(
    year = "2019",
    manifest = "450",
    detail = "874",
    container = "HLXU5183586",
    carrier = "Hapag-Lloyd",
)

In [9]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [12]:
# Start Firefox and open the tracking page built from the container metadata
driver = webdriver.Firefox(executable_path = "../driver/geckodriver")
tracking_url = config["general"]["url"].format(**container)
driver.get(tracking_url)

# Wait up to 60 seconds for the rows of the results table, then report how many were found
rows_locator = (By.CSS_SELECTOR, ".hal-table tbody tr")
rows = WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located(rows_locator))
print(len(rows))

8


In [46]:
def execute_input_commands():
    """
    Run every command of the "input" configuration section, in order, against the global
    driver and container. Returns the first result that is not True, or True if all succeed.
    """
    # Lazy generator: commands after the first non-True result are never executed
    outcomes = (
        TrackingScraperSwitcher(driver, container, config, input_command).process()
        for input_command in config["input"]
    )
    return next((outcome for outcome in outcomes if outcome is not True), True)

True
True


In [10]:
# Run every input command in a new browser session
driver = webdriver.Firefox(executable_path = "../driver/geckodriver")
for command in config["input"]:
    # The switcher expects (driver, document, configuration, command): the configuration
    # argument was missing, which shifted the command into the configuration slot
    TrackingScraperSwitcher(driver, container, config, command).process()

class name
id


In [1]:
# quit() ends the whole WebDriver session; close() would only close the current window
driver.quit()
# May raise InvalidSessionIdException or WebDriverException

NameError: name 'driver' is not defined

## Main scraper class

In [None]:
class TrackingScraper:
    """
    Main scraper class.

    Owns one Firefox driver, created on construction. The page, input and output steps are
    still placeholders.
    """
    def __init__(self, metadata):
        """
        Store the metadata and start the Firefox driver.
        Raises TrackingScraperError if the driver cannot be created.
        """
        self.__metadata = metadata
        try:
            self.__driver = webdriver.Firefox(executable_path = "../driver/geckodriver")
        except WebDriverException as ex:
            # Chain the original exception so the underlying driver failure stays visible
            raise TrackingScraperError("Error creating Selenium driver") from ex

    def execute(self):
        """
        Run the scraper. On a driver or scraper error the driver is shut down and a
        TrackingScraperError carrying the original message is raised.

        NOTE(review): only the page step is invoked, as in the original draft; confirm whether
        the input and output steps should run here too.
        """
        try:
            self._go_to_page()
        except (WebDriverException, TrackingScraperError) as ex:
            self.__close_driver(str(ex))

    def _go_to_page(self):
        """Placeholder: navigate to the tracking page (not implemented yet)."""
        pass

    def _execute_input(self):
        """Placeholder: run the input commands (not implemented yet)."""
        pass

    def _execute_output(self):
        """Placeholder: run the output commands (not implemented yet)."""
        pass

    def __close_driver(self, message):
        """Shut the driver down, then raise TrackingScraperError with the given message."""
        # quit() ends the whole session; close() would only close the current window
        self.__driver.quit()
        raise TrackingScraperError(message)

In [None]:
# Containers to be scraped (none yet)
containers = list()