# Tracking Scraper

In [1]:
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pymongo import MongoClient

import datetime
import json
import logging
import re
import time
import sys

In [2]:
# Date stamp (YYYYMMDD, local time) used to name today's log file
today = datetime.date.today().strftime("%Y%m%d")

# Send INFO-and-above records to a per-day log file
logging.basicConfig(
    filename = "../logs/scraper-" + today + ".log",
    level    = logging.INFO,
    format   = "[%(levelname)s %(asctime)s] %(message)s",
)

# MongoDB handles: client with default connection settings, scraper database, container collection
client          = MongoClient()
database        = client["tracking_scraper"]
container_table = database["containers"]

## Exception class and constants

In [3]:
class TrackingScraperError(Exception):
    """
    Error raised by the Tracking Web Scraper itself (missing configuration keys, failed
    assertions, timeouts), as opposed to errors coming from Selenium or the database.
    """

class TrackingScraperConfig:
    """Shared constants and default option values for the Tracking Web Scraper."""

    # ----- Webdriver executables --------------------------------------------------------------
    DEFAULT_PATH_CHROME     = "../driver/chromedriver"    # Google Chrome webdriver
    DEFAULT_PATH_FIREFOX    = "../driver/geckodriver"     # Firefox webdriver

    # ----- Timing, all values in seconds ------------------------------------------------------
    DEFAULT_TIMEOUT         = 30     # timeout for short processing
    DEFAULT_TIMEOUT_LONG    = 90     # timeout for long processing
    DEFAULT_WAIT_LONG       = 5      # pause after long actions
    DEFAULT_WAIT_SHORT      = 1.5    # pause after short actions

    # ----- Fallbacks for optional keys of configuration commands ------------------------------
    DEFAULT_KEY_REQUIRED    = True   # "required", in all types
    DEFAULT_KEY_ACTION      = True   # "action", in type "alert"
    DEFAULT_KEY_WAIT        = True   # "wait", in type "click"
    DEFAULT_KEY_CLEAN       = False  # "clean", in type "write"
    DEFAULT_KEY_ENTER       = False  # "enter", in type "write"
    DEFAULT_KEY_OVERWRITE   = False  # "overwrite", in multiple configuration
    DEFAULT_KEY_FRAME       = False  # "frame", in selector types

    # ----- Fallbacks for values stored with the scraped documents -----------------------------
    DEFAULT_KEY_PROCESSED   = True   # "processed", when upserting container info
    DEFAULT_KEY_ESTIMATED   = True   # "estimated", in container movements

    # ----- Text and datetime conversion -------------------------------------------------------
    DEFAULT_THOUSAND_SYMBOL = ","    # thousand separator symbol
    # Datetime locale information, as keyword arguments for datetime.timedelta
    DEFAULT_DATETIME_LOCALE = dict(hours = -5)

## Selector class

In [4]:
class TrackingScraperSwitcher:
    """
    Switcher for selecting and saving Web elements and subelements in a tracking-related document.

    An instance binds one configuration command (a dictionary) to the element it operates on.
    The command's "type" key selects the handler: the method "_process_<type>" is looked up on
    the instance, so the supported types are id, class, css, name, tag, xpath, split, regex,
    save, attr, compare, write, alert, click and ocr.

    Handlers return True on success, False on a recoverable failure (usually "not required"),
    or the selected elements/values when the command has no child commands.
    """

    def __init__(self, driver, document, configuration, parent_command, parent_element = None):
        """
        driver:         Selenium webdriver; also the search root when no parent element is given.
        document:       Dictionary where "save" and "ocr" commands store their values.
        configuration:  Scraper configuration, forwarded to the converter and to child switchers.
        parent_command: Command dictionary to execute.
        parent_element: Web element or string the command works on; defaults to the driver.
        """
        self.__driver         = driver
        self.__document       = document
        self.__configuration  = configuration
        self.__parent_command = parent_command
        self.__parent_element = driver if parent_element is None else parent_element

    @property
    def document(self):
        """Returns the stored tracking-related dictionary."""
        return self.__document

    ###############################################################################################

    def process(self):
        """
        Get Web elements based on the current configuration command, then process or return them
        accordingly. Returns True if all commands and subcommands were executed successfully,
        False if one command failed, or the list of Web elements if no subcommands were found.

        Raises TrackingScraperError if the command has no type, the type is unknown, or the
        handler fails with an AttributeError/TypeError (e.g. a DOM action applied to a string).
        """
        # Get process type
        process_type = self.__parent_command.get("type")
        if process_type is None:
            raise TrackingScraperError("Process type not found")
        logging.info("Process type: %s", process_type)

        # Resolve the handler before calling it, so that an error raised *inside* the handler is
        # not misreported as an unknown or non-callable process type
        method = getattr(self, "_process_" + str(process_type), None)
        if method is None:
            raise TrackingScraperError("Process type " + str(process_type) + " is not valid")
        if not callable(method):
            raise TrackingScraperError("Process type " + str(process_type)
                                       + " can't be directly invoked")

        # Execute process; keep reporting handler failures as TrackingScraperError, but with
        # the real cause attached
        try:
            return method()
        except (AttributeError, TypeError) as error:
            raise TrackingScraperError("Process type " + str(process_type) + " failed: "
                                       + str(error)) from error

    ###############################################################################################

    def _process_id(self):
        return self.__process_dom_elements(By.ID)
    def _process_class(self):
        return self.__process_dom_elements(By.CLASS_NAME)
    def _process_css(self):
        return self.__process_dom_elements(By.CSS_SELECTOR)
    def _process_name(self):
        return self.__process_dom_elements(By.NAME)
    def _process_tag(self):
        return self.__process_dom_elements(By.TAG_NAME)
    def _process_xpath(self):
        return self.__process_dom_elements(By.XPATH)

    def __process_dom_elements(self, selector_type):
        """
        Find the elements matching the command's "selector" under the parent element and hand
        them to the child commands ("commands": one per index, "command": the same for all).
        When the command carries a boolean "assert", only the assertion is checked.
        """
        # Get selector
        selector = self.__parent_command.get("selector")
        if selector is None:
            raise TrackingScraperError("Selector not found in process by " + selector_type)

        # Check assertions; an assertion command does nothing else
        if self.__check_assertions(selector_type, selector):
            logging.info("Assertions are correct")
            return True

        # Get DOM elements
        dom_elements = self.__parent_element.find_elements(selector_type, selector)

        # Check requirements
        if not dom_elements:
            logging.info("No elements found, using required")
            return not self.__parent_command.get("required",
                                                 TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, dom_elements)

        # Get a child command for all elements and process them, if possible
        child_command = self.__parent_command.get("command")
        if isinstance(child_command, dict):
            for child_element in dom_elements:
                result = self.__generate_child_process(child_command, child_element)
                if result is not True:
                    return result
            return True

        # If no single child command was found, return all DOM elements
        logging.info("No commands found, return all elements")
        return dom_elements

    def __check_assertions(self, selector_type, selector):
        """
        If the command has a boolean "assert" key, wait up to DEFAULT_TIMEOUT seconds for the
        selector to be present (True) or absent (False) and return True; raise
        TrackingScraperError when the wait times out. Return False if there is no assertion.
        """
        assertion = self.__parent_command.get("assert")
        if not isinstance(assertion, bool):
            logging.info("Assertions not found, selector: %s by %s", selector, selector_type)
            return False

        # Set expected conditions depending if we want to switch to a frame or not
        locator = (selector_type, selector)
        if self.__parent_command.get("frame", TrackingScraperConfig.DEFAULT_KEY_FRAME):
            conditions = EC.frame_to_be_available_and_switch_to_it(locator)
        else:
            conditions = EC.presence_of_all_elements_located(locator)

        # Prepare waiter
        waiter = WebDriverWait(self.__driver, TrackingScraperConfig.DEFAULT_TIMEOUT)

        if assertion:
            # Assert at least one element found
            try:
                waiter.until(conditions)
            except TimeoutException:
                raise TrackingScraperError("Assertion error: Elements unexpectedly not found")
        else:
            # Assert no elements found
            try:
                waiter.until_not(conditions)
            except TimeoutException:
                raise TrackingScraperError("Assertion error: Elements unexpectedly found")

        # Wait a little bit and return
        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)
        return True

    def __process_child_commands(self, commands, elements):
        """
        Run every child command against the element at the command's "index". Stops at, and
        returns, the first result that is not True.
        """
        for child_command in commands:
            # Get index
            index = child_command.get("index")
            if index is None:
                raise TrackingScraperError("Child index command not found")

            # Check requirements; negative indexes count from the end, like list indexing, so
            # both ends of the valid range are checked
            if not -len(elements) <= index < len(elements):
                logging.info("Child element at index %d, using required", index)
                return not child_command.get("required",
                                             TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

            # Process child element at specified index
            logging.info("Child index: %d", index)
            result = self.__generate_child_process(child_command, elements[index])

            # If no subelements were found, return that element or element list
            # If a minor error occurred (e.g. element not found), return False
            if result is not True:
                return result

        # If everything was fine, return True
        return True

    def __generate_child_process(self, child_command, child_element):
        """Run a child command on a child element, sharing driver, document and configuration."""
        return TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                       child_command, child_element).process()

    ###############################################################################################

    def _process_split(self):
        """Split the parent text by "delimiter" and process the pieces with "commands"."""
        # Get text to split
        parent_text = self.__get_parent_text()

        # Get text separator
        delimiter = self.__parent_command.get("delimiter")
        if delimiter is None:
            raise TrackingScraperError("No separator found")

        # Split text
        elements = parent_text.split(delimiter)

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return split list
        return elements

    def __get_parent_text(self):
        """Return the stripped text of the parent, be it a DOM element or already a string."""
        # A DOM element exposes its inner text as ".text"; a plain string is used as-is
        return getattr(self.__parent_element, "text", self.__parent_element).strip()

    ###############################################################################################

    def _process_regex(self):
        """Search "pattern" in the parent text and process its groups with "commands"."""
        # Get text
        text = self.__get_parent_text()

        # Get regular expression pattern
        pattern = self.__parent_command.get("pattern")
        if pattern is None:
            raise TrackingScraperError("No regular expression found")

        # Match expression with text
        regex = re.search(pattern, text)
        if regex is None:
            logging.info("Regular expression does not match text, using required")
            return not self.__parent_command.get("required",
                                                 TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

        # Get list of matched elements
        elements = list(regex.groups())

        # Get child command list and process them, if possible
        commands = self.__parent_command.get("commands")
        if isinstance(commands, list):
            return self.__process_child_commands(commands, elements)

        # If no single child command was found, return list of matched elements
        return elements

    ###############################################################################################

    def _process_save(self):
        """
        Store a value in the document under "key": the command's fixed "value" if it has one,
        otherwise the parent text, converted according to "format" when given.
        """
        attribute = self.__parent_command.get("key")
        if attribute is None:
            raise TrackingScraperError("Save key not found")

        # If a value was already defined, save it and exit
        value = self.__parent_command.get("value")
        if value is not None:
            self.__document[attribute] = value
            return True

        # Get text to be saved, and verify if it's not empty
        value = self.__get_parent_text()
        if not value:
            logging.info("Text to save is empty, using required")
            return not self.__parent_command.get("required",
                                                 TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

        # Format type if necessary
        format_type = self.__parent_command.get("format")
        if format_type is not None:
            value = TrackingScraperConverter(value, format_type, self.__configuration).convert()
            # If a value already exists in the attribute and it's a datetime object, join them
            if self.__join_datetimes_if_possible(attribute, value):
                return True

        # Save according to parent key and formatting value
        self.__document[attribute] = value
        return True

    def __join_datetimes_if_possible(self, attribute, new_value):
        """
        If the document already holds a datetime under the attribute and the new value is a
        time, replace the stored value with that date at the new time and return True.
        Return False (and store nothing) otherwise.
        """
        old_value = self.__document.get(attribute)
        if isinstance(old_value, datetime.datetime) and isinstance(new_value, datetime.time):
            self.__document[attribute] = datetime.datetime.combine(old_value.date(), new_value)
            return True
        return False

    ###############################################################################################

    def _process_attr(self):
        """Read the attribute "name" of the parent element and pass it to "command", if any."""
        # Get attribute name
        attribute_name = self.__parent_command.get("name")
        if attribute_name is None:
            raise TrackingScraperError("Attribute name not found")

        # Get attribute value from parent element
        attribute = self.__parent_element.get_attribute(attribute_name)

        # Get child command, if none found, return attribute
        child_command = self.__parent_command.get("command")
        if child_command is not None:
            logging.info("ATTRIBUTE - Child command found")
            return self.__generate_child_process(child_command, attribute)
        logging.info("ATTRIBUTE - No child command found")
        return attribute

    ###############################################################################################

    def _process_compare(self):
        """
        Compare the parent text with "values" and run the "success" or "failure" commands.
        "values" may be a single string (exact match) or a collection of accepted strings.
        """
        # Get text to compare
        text = self.__get_parent_text()

        # Get values to compare
        values = self.__parent_command.get("values")
        if values is None:
            raise TrackingScraperError("Values to compare not found")

        # Check if text equals to value, or if it is in value list. A single string must be
        # compared with "==": "in" would accept any substring of it
        if isinstance(values, str):
            matched = (text == values)
        else:
            matched = text in values

        # Act accordingly
        if matched:
            return self.__process_compare_commands(self.__parent_command.get("success"),
                                                   "Success")
        return self.__process_compare_commands(self.__parent_command.get("failure"), "Failure")

    def __process_compare_commands(self, commands, compare_result):
        """Run the commands of one compare branch against the parent element."""
        # Check requirements
        if commands is None:
            logging.info(compare_result + " commands not found, resorting to required")
            return not self.__parent_command.get("required",
                                                 TrackingScraperConfig.DEFAULT_KEY_REQUIRED)

        # Process child commands
        for child_command in commands:
            result = self.__generate_child_process(child_command, self.__parent_element)
            if result is not True:
                return result
        return True

    ###############################################################################################

    def _process_write(self):
        """
        Type into the parent element the command's "value" or, failing that, the document
        value stored under the key named by "attribute". Honors "clean" and "enter".
        """
        # Get value
        value = self.__parent_command.get("value")
        if value is None:
            # Get value from attribute
            attribute = self.__parent_command.get("attribute")
            if attribute is None:
                raise TrackingScraperError("No value or attribute to use as input")
            value = self.__document.get(attribute)
            # Without this check, None would reach send_keys() and fail with an obscure error
            if value is None:
                raise TrackingScraperError("Attribute " + str(attribute)
                                           + " has no value to use as input")

        try:
            # Clear element if specified
            if self.__parent_command.get("clean", TrackingScraperConfig.DEFAULT_KEY_CLEAN):
                self.__parent_element.clear()
            # Write value
            self.__parent_element.send_keys(value)
            # Send enter if specified
            if self.__parent_command.get("enter", TrackingScraperConfig.DEFAULT_KEY_ENTER):
                self.__parent_element.send_keys(Keys.ENTER)
        except AttributeError:
            raise TrackingScraperError("Element is not interactable (attribute)")
        except ElementNotInteractableException:
            raise TrackingScraperError("Element is not interactable (selenium)")

        # Return True to indicate everything is OK
        time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)
        return True

    ###############################################################################################

    def _process_alert(self):
        """
        Accept ("action" true) or dismiss the current browser alert. With "assertion" set to
        True or False, raise TrackingScraperError if the alert is missing or present.
        """
        assertion = self.__parent_command.get("assertion")
        # TODO: use waits
        try:
            # Try to switch to alert
            alert = self.__driver.switch_to.alert
            if assertion is False:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly found")
            # Accept or dismiss action depending on command
            if self.__parent_command.get("action", TrackingScraperConfig.DEFAULT_KEY_ACTION):
                alert.accept()
            else:
                alert.dismiss()
        except NoAlertPresentException:
            if assertion is True:
                raise TrackingScraperError("Assertion failed: Alert unexpectedly not found")

        # Return True to indicate everything is OK
        return True

    ###############################################################################################

    def _process_click(self):
        """Click the parent element if it is displayed and enabled, then pause."""
        # Check requirements
        required = self.__parent_command.get("required", TrackingScraperConfig.DEFAULT_KEY_REQUIRED)
        if not self.__parent_element.is_displayed():
            return not required
        if not self.__parent_element.is_enabled():
            return not required

        try:
            # Try to click the element
            self.__parent_element.click()

            # Pause for the long or the short wait depending on "wait" attribute
            if self.__parent_command.get("wait", TrackingScraperConfig.DEFAULT_KEY_WAIT):
                time.sleep(TrackingScraperConfig.DEFAULT_WAIT_LONG)
            else:
                time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)

            # Return True to indicate everything is OK
            return True
        except ElementNotInteractableException:
            return not required

    ###############################################################################################

    def _process_ocr(self):
        """Ask on the console for a captcha text of "length" characters; save it as "ocr"."""
        length = self.__parent_command.get("length")
        if length is None:
            raise TrackingScraperError("Text length not defined")

        # Request text
        text = input("Enter captcha text: ")
        if len(text) != length:
            raise TrackingScraperError("Text is not " + str(length) + " characters long")

        # Save to attribute
        self.__document["ocr"] = text
        return True

## Converter class

In [5]:
class TrackingScraperConverter:
    """
    Turns a scraped text into another Python type (number, datetime, time).

    The format type selects the method "_convert_to_<format type>". Every conversion falls
    back to returning the text unchanged when it cannot be carried out.
    """

    def __init__(self, raw_text, format_type, configuration):
        self.__raw_text      = raw_text
        self.__format_type   = format_type
        self.__configuration = configuration

    def convert(self):
        """Convert the text to the requested type; unknown types leave the text as-is."""
        try:
            converter = getattr(self, "_convert_to_" + self.__format_type)
            return converter()
        except AttributeError:
            logging.info("Convertion to " + self.__format_type + " not supported, resorting to text")
            return self.__raw_text
        except TypeError:
            raise TrackingScraperError("Convertion to " + self.__format_type + " cannot be invoked")

    def __text_without_thousand_symbol(self):
        # Drop thousand separators so that int() and float() can parse the number
        return self.__raw_text.replace(TrackingScraperConfig.DEFAULT_THOUSAND_SYMBOL, "")

    def _convert_to_int(self):
        """Parse the text as an integer, ignoring thousand separators."""
        try:
            number = int(self.__text_without_thousand_symbol())
        except ValueError:
            logging.info("Convertion to integer failed, resorting to text")
            return self.__raw_text
        return number

    def _convert_to_float(self):
        """Parse the text as a floating-point number, ignoring thousand separators."""
        try:
            number = float(self.__text_without_thousand_symbol())
        except ValueError:
            logging.info("Convertion to float failed, resorting to text")
            return self.__raw_text
        return number

    def _convert_to_double(self):
        """Same conversion as "float"."""
        return self._convert_to_float()

    def _convert_to_date(self):
        """
        Parse the text as a datetime, using the first matching strptime pattern listed under
        "general" -> "date_formats" in the configuration.
        """
        # Look up the candidate patterns
        try:
            date_formats = self.__configuration["general"]["date_formats"]
        except KeyError:
            logging.info("Datetime patterns not found, resorting to text")
            return self.__raw_text

        # The first pattern that parses the text wins
        for date_format in date_formats:
            try:
                parsed = datetime.datetime.strptime(self.__raw_text, date_format)
            except ValueError:
                continue
            return parsed

        # No pattern was able to parse the text
        logging.info("None of the patterns matched, resorting to text")
        return self.__raw_text

    def _convert_to_datetime(self):
        """Same conversion as "date"."""
        return self._convert_to_date()

    def _convert_to_time(self):
        """Parse the text like "date", keeping only the time of day."""
        parsed = self._convert_to_date()
        return parsed.time() if isinstance(parsed, datetime.datetime) else parsed

    def _convert_to_datelocal(self):
        """Parse the text like "date", then subtract the DEFAULT_DATETIME_LOCALE offset."""
        parsed = self._convert_to_date()
        if not isinstance(parsed, datetime.datetime):
            return parsed
        offset = datetime.timedelta(**TrackingScraperConfig.DEFAULT_DATETIME_LOCALE)
        return parsed - offset

    def _convert_to_timelocal(self):
        """Parse the text like "datelocal", keeping only the time of day."""
        shifted = self._convert_to_datelocal()
        return shifted.time() if isinstance(shifted, datetime.datetime) else shifted

    def _convert_to_status(self):
        """Meant to translate the text to a tracking status; currently returns it unchanged."""
        # TO-DO
        return self.__raw_text

## Main scraper class

In [6]:
class TrackingScraper:
    """Main class for the Tracking Web Scraper."""
    
    def __init__(self, driver, database, document):
        """
        Load the carrier configuration and resolve the database collections.

        driver:   Selenium webdriver used for scraping.
        database: Database object, indexed by collection name.
        document: Container information; its "carrier" key names the configuration file
                  "../config/<carrier>.json".

        Raises TrackingScraperError if the carrier is missing, or the configuration file does
        not exist, cannot be parsed, or lacks the "general" section.
        """
        self.__driver   = driver
        self.__database = database
        self.__document = document

        # Get carrier name
        try:
            carrier = self.__document["carrier"]
        except KeyError as error:
            raise TrackingScraperError("Carrier not found") from error

        # Get configuration file
        try:
            with open("../config/" + carrier + ".json", encoding = "utf-8") as file:
                self.__configuration = json.load(file)
        except FileNotFoundError as error:
            raise TrackingScraperError("Configuration file not found") from error
        except ValueError as error:
            # Covers malformed JSON (json.JSONDecodeError) and undecodable bytes
            # (UnicodeDecodeError); both are subclasses of ValueError
            raise TrackingScraperError("Configuration file is not valid: " + str(error)) from error

        # Get general configuration
        if not isinstance(self.__configuration, dict) or "general" not in self.__configuration:
            raise TrackingScraperError("General configuration information not found")

        # Get single and multiple tables
        self.__single_table, self.__single_query     = self._get_database_config(database, "single")
        self.__multiple_table, self.__multiple_query = self._get_database_config(database, "multiple")
    
    def _get_database_config(self, database, config_type):
        # Get configuration
        collection_configuration = self.__configuration["general"].get(config_type)
        if collection_configuration is None:
            return None

        # Get collection name
        table_name = collection_configuration.get("table")
        if table_name is None:
            raise TrackingScraperError("Table name for " + config_type + " entries not found")
        
        # Get collection query
        table_query_keys = collection_configuration.get("query")
        if not isinstance(table_query_keys, list):
            table_query_keys = []
        
        # Return database and query keys
        return database[table_name], table_query_keys
        
    @property
    def document(self):
        """Returns the container information."""
        return self.__document
    
    ###############################################################################################
    
    def execute(self):
        """
        Execute commands: load the page, then run the input, single output and multiple output
        stages, retrying from the first unfinished stage until all of them succeed or
        DEFAULT_TIMEOUT seconds have passed since the page was loaded.

        Returns the result of saving the document when every stage succeeded, False otherwise.
        Errors (TrackingScraperError or any other Exception) are logged, not raised;
        interpreter-level exceptions such as KeyboardInterrupt are left to propagate.
        """

        parent_result   = False
        input_result    = False
        single_result   = False
        multiple_result = False

        try:
            start = self._go_to_url()
            while True:
                # Check if we're still on time
                end = time.time()
                if (end - start) > TrackingScraperConfig.DEFAULT_TIMEOUT:
                    raise TrackingScraperError("Timeout exceeded, scraping was unsuccessful")

                # Execute input
                input_result = self._execute_commands(input_result, "input")
                if input_result is not True:
                    logging.info("Input execution was unsuccessful, retrying...")
                    continue

                # Execute single output
                single_result = self._execute_commands(single_result, "single")
                if single_result is not True:
                    logging.info("Single output execution was unsuccessful, retrying...")
                    continue

                # Execute multiple output
                multiple_result = self._execute_multiple_output(multiple_result)
                if multiple_result is not True:
                    logging.info("Multiple output execution was unsuccessful, retrying...")
                    continue

                # Finish execution and save elements
                parent_result = self._finish_execution()
                time.sleep(TrackingScraperConfig.DEFAULT_WAIT_SHORT)
                break
        except TrackingScraperError:
            logging.exception("Exception ocurred")
        except Exception:
            logging.exception("Unknown exception ocurred")

        # Return after the try statement, not from a "finally" clause: a return inside
        # "finally" would silently discard KeyboardInterrupt and SystemExit as well
        return parent_result
    
    ###############################################################################################
    
    def _go_to_url(self):
        """
        Open the page whose address is "general" -> "url" in the configuration, after filling
        its placeholders with the document values, and pause for the long wait.

        Returns the time (time.time()) at which the page finished loading.
        Raises TrackingScraperError if the address is not configured or the load times out.
        """
        # The general section holds the address
        general_config = self.__configuration.get("general")
        if general_config is None:
            raise TrackingScraperError("Configuration information not found")

        link = general_config.get("url")
        if link is None:
            raise TrackingScraperError("Configuration URL could not be found")

        # Load the page and give it some time to settle
        try:
            self.__driver.get(link.format(**self.__document))
            time.sleep(TrackingScraperConfig.DEFAULT_WAIT_LONG)
        except TimeoutException:
            raise TrackingScraperError("Error loading Web page, timeout exceeded")

        # The scraping time budget starts counting from here
        started_at = time.time()
        return started_at
    
    ###############################################################################################
    
    def _execute_commands(self, parent_result, key):
        # Check if commands were already executed
        if parent_result is True:
            return True
        
        # Get commands, if none found, return True
        commands = self.__configuration.get(key)
        if commands is None:
            return True
        
        # Process commands
        for command in commands:
            result = TrackingScraperSwitcher(self.__driver, self.__document, self.__configuration,
                                             command).process()
            if result is not True:
                return False
        
        # Return True if everything was OK
        return True
    
    ###############################################################################################
    
    def _execute_multiple_output(self, multiple_result):
        if multiple_result is True:
            return True
        
        # Get multiple command, if none found, return True
        multiple_command = self.__configuration.get("multiple")
        if multiple_command is None:
            return True
        
        # Get configuration key
        multiple_configuration = self.__configuration["general"].get("multiple")
        if multiple_configuration is None:
            return True
        
        # Create multiple document based on single query items
        multiple_document = self._create_query_document(self.__document, self.__single_query)
        # Overwrite previous tracking items, if necessary
        if multiple_command.get("overwrite", TrackingScraperConfig.DEFAULT_KEY_OVERWRITE):
            self.__single_table.delete_many(multiple_document)
        
        # Generate and process multiple documents
        estimated = multiple_configuration.get("estimated", TrackingScraperConfig.DEFAULT_KEY_ESTIMATED)
        multiple_document["estimated"] = estimated
        return self.__process_multiple_elements(multiple_command, multiple_document, self.__driver)
    
    def __process_multiple_elements(self, multiple_command, multiple_document, previous_element):
        """
        Build and store one document per parent element of a multiple command.

        For every element selected by the "parents" command (or for previous_element alone if
        there is none), a copy of multiple_document is filled in by the "single" commands. The
        copy is then saved in the multiple table or, when the command has a nested "multiple"
        command, used as the base document for a recursive call on that element.

        Returns True if everything succeeded, False as soon as a subcommand fails.
        Raises TrackingScraperError if "single" is not a list or "parents" yields no list.
        """
        # Commands that fill in each document
        row_commands = multiple_command.get("single")
        if not isinstance(row_commands, list):
            raise TrackingScraperError("Multiple command must have single commands key")

        # Elements to iterate: the ones found by "parents", else just the previous element
        parents_command = multiple_command.get("parents")
        if parents_command is not None:
            parent_elements = TrackingScraperSwitcher(self.__driver, {}, self.__configuration,
                                                      parents_command, previous_element).process()
            if not isinstance(parent_elements, list):
                raise TrackingScraperError("Parent elements must be a list of web elements")
        else:
            parent_elements = [previous_element]

        # Optional nested level
        nested_command = multiple_command.get("multiple")

        for parent_element in parent_elements:
            # Every element starts from its own copy of the base document
            row_document = dict(multiple_document)
            for row_command in row_commands:
                row_switcher = TrackingScraperSwitcher(self.__driver, row_document,
                                                       self.__configuration, row_command,
                                                       parent_element)
                if row_switcher.process() is not True:
                    logging.info("Multiple: single subcommand failed")
                    return False

            # Innermost level: store the document and move on to the next element
            if nested_command is None:
                self._insert_or_update(row_document, self.__multiple_table, self.__multiple_query)
                continue

            # Otherwise descend one level, with this document as the new base
            nested_result = self.__process_multiple_elements(nested_command, row_document,
                                                             parent_element)
            if nested_result is not True:
                logging.info("Multiple: multiple subcommand failed")
                return False

        # Every element was processed
        return True
    
    ###############################################################################################
    
    def _finish_execution(self):
        """Set the "processed" value on the main document and save it.

        Returns True without saving anything when the "general" configuration
        has no "single" entry; otherwise returns the result of
        ``_insert_or_update`` on the single table.
        """
        single_settings = self.__configuration["general"].get("single")
        if single_settings is not None:
            # Store the configured "processed" value (or the default) before saving
            self.__document["processed"] = single_settings.get(
                "processed", TrackingScraperConfig.DEFAULT_KEY_PROCESSED)
            return self._insert_or_update(self.__document, self.__single_table,
                                          self.__single_query)

        # Nothing configured for the single element, so there is nothing to save
        return True
    
    def _insert_or_update(self, document, collection, query_keys):
        """Update the document matching ``query_keys`` in ``collection``, or insert it.

        The filter is built from ``document`` restricted to ``query_keys``. An
        update with ``$set`` is attempted first; if it matches nothing, the
        document is inserted instead. ``document`` is modified in place:
        "updated_at" is set before the update, and on the insert path
        "created_at" is set and "updated_at" is reset to None.

        Returns:
            Always True.
        """
        selector = self._create_query_document(document, query_keys)
        logging.info("query document: %s", selector)

        # First attempt: update whatever already matches the selector
        document["updated_at"] = datetime.datetime.utcnow()
        outcome = collection.update_one(selector, {"$set": document})

        if outcome.matched_count > 0:
            logging.info("Container updated: %s", selector)
        else:
            # Nothing matched: store it as a new document that has never been updated
            document["created_at"] = datetime.datetime.utcnow()
            document["updated_at"] = None
            collection.insert_one(document)
            logging.info("Container insert: %s", selector)
        return True
    
    def _create_query_document(self, document, query_keys):
        query_document = {}
        for key in query_keys:
            query_document[key] = document.get(key)
        return query_document

## Unit tests

In [7]:
"""
containers = [
    { "year": "2019", "manifest": "TEST", "detail": "1", "container": "FSCU5670046", "carrier": "Hapag-Lloyd" },
    { "year": "2019", "manifest": "TEST", "detail": "2", "container": "HLXU5183586", "carrier": "Hapag-Lloyd" },
    { "year": "2019", "manifest": "TEST", "detail": "3", "container": "MAEU6835658", "carrier": "Maersk" },
    { "year": "2019", "manifest": "TEST", "detail": "4", "container": "EGSU9089973", "carrier": "Evergreen" },
    { "year": "2019", "manifest": "TEST", "detail": "5", "container": "TEMU3806660", "carrier": "Textainer" }]
"""

'\ncontainers = [\n    { "year": "2019", "manifest": "TEST", "detail": "1", "container": "FSCU5670046", "carrier": "Hapag-Lloyd" },\n    { "year": "2019", "manifest": "TEST", "detail": "2", "container": "HLXU5183586", "carrier": "Hapag-Lloyd" },\n    { "year": "2019", "manifest": "TEST", "detail": "3", "container": "MAEU6835658", "carrier": "Maersk" },\n    { "year": "2019", "manifest": "TEST", "detail": "4", "container": "EGSU9089973", "carrier": "Evergreen" },\n    { "year": "2019", "manifest": "TEST", "detail": "5", "container": "TEMU3806660", "carrier": "Textainer" }]\n'

In [10]:
# Load the unprocessed Textainer containers, in descending "_id" order
query = {
    "carrier": "Textainer",
    "processed": False,
}
containers = list(container_table.find(query).sort("_id", -1))
# Bare expression so the notebook displays how many containers were loaded
len(containers)

7595

In [None]:
# Scrape every loaded container with a single shared Chrome session.
# The run stops once 10 containers have failed, or at the first unexpected exception.
fail_counter = 0
start = time.time()
driver = webdriver.Chrome(executable_path = TrackingScraperConfig.DEFAULT_PATH_CHROME)
try:
    for container in containers:
        if fail_counter >= 10:
            logging.error("Too much failures, aborting")
            break
        cont_start = time.time()
        try:
            scraper = TrackingScraper(driver, database, container)
            if not scraper.execute():
                fail_counter += 1
                logging.error("Scraper for container %s unsuccessful", container["container"])
        except TrackingScraperError as ex:
            # Known scraper failure: count it and carry on with the next container
            fail_counter += 1
            logging.error("Error extracting container information: %s", str(ex))
        except Exception:
            # Anything unexpected is logged with its traceback and ends the run
            fail_counter += 1
            logging.exception("Unknown exception ocurred when creating or executing scraper")
            break
        finally:
            # Report the time spent on this container, whether it succeeded or not
            cont_end = time.time()
            print("Container time:", cont_end - cont_start, "seconds")
    # input("Press Enter to quit")
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes the
    # current window; the finally clause also covers an interrupted run.
    driver.quit()
end = time.time()
print("Total time:", end - start, "seconds")