In [1]:
from datetime import datetime, timedelta
from pymongo import MongoClient
from selenium import webdriver

import time
import logging

In [2]:
# Logging goes to one file per calendar day (local time): containers-YYYYMMDD.log
today = datetime.now().strftime("%Y%m%d")
logging.basicConfig(
    filename="../../logs/containers-" + today + ".log",
    level=logging.INFO,
    format="[%(levelname)s %(asctime)s] %(message)s",
)

# MongoDB handles. MongoClient() is called without arguments, so the
# connection uses the library's default settings.
client = MongoClient()
database = client["scraper3"]
knowledge_table, container_table, container_prefixes = (
    database[collection_name]
    for collection_name in ("knowledges", "containers", "container_prefixes")
)

## Input

In [3]:
def execute_input(driver, knowledge):
    """Open the detail page of one knowledge record in the browser.

    Args:
        driver: object exposing ``get(url)`` (a Selenium WebDriver here).
        knowledge: mapping that provides at least the ``year``, ``manifest``,
            ``detail`` and ``knowledge`` entries; they are substituted into
            the query string. Extra entries are ignored.

    Raises:
        KeyError: if one of the four entries is missing from ``knowledge``.
    """
    url_template = (
        "http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias"
        "?accion=consultarDetalleConocimientoEmbarque"
        "&CMc2_Anno={year}"
        "&CMc2_Numero={manifest}"
        "&CMc2_NumDet={detail}"
        "&CG_cadu=118"
        "&CMc2_TipM=mc"
        "&CMc2_numcon={knowledge}"
    )
    driver.get(url_template.format(**knowledge))
    # Fixed five-second pause after navigation
    # (presumably to let the page render / throttle requests — confirm).
    time.sleep(5)

## Interaction with the database

In [4]:
def insert_or_update(collection, document, query_keys):
    """Upsert ``document`` into ``collection``, matching on a subset of its keys.

    A single ``update_one(..., upsert=True)`` call is used, so the
    "update, and insert if nothing matched" decision is made by the server in
    one round trip instead of by two separate client calls.

    Args:
        collection: object exposing a PyMongo-style ``update_one``.
        document: dict of fields to store. Every field is written with
            ``$set``. When an insert happens, the dict is updated in place
            with the ``processed`` default (if it was absent) and with the
            generated ``_id`` (if it had none).
        query_keys: iterable of keys of ``document`` whose values form the
            match filter.

    Raises:
        KeyError: if a key in ``query_keys`` is missing from ``document``.
    """
    # The filter is the document's own values for the identifying keys
    query_doc = {key: document[key] for key in query_keys}

    # Snapshot the fields so the in-place bookkeeping below cannot alter the
    # update that was sent.
    update = {"$set": dict(document)}

    # New documents default to processed=False. $setOnInsert leaves the flag
    # of an existing document untouched. It is only added when "processed" is
    # not already part of $set, because two operators may not target the same
    # field.
    needs_default = "processed" not in document
    if needs_default:
        update["$setOnInsert"] = {"processed": False}

    result = collection.update_one(query_doc, update, upsert=True)

    if result.matched_count > 0:
        logging.info("updated: %s", query_doc)
        return

    # Nothing matched, so the server inserted a new document. Mirror what was
    # stored back onto the caller's dict.
    if needs_default:
        document["processed"] = False
    if result.upserted_id is not None:
        document.setdefault("_id", result.upserted_id)
    logging.info("insert: %s", query_doc)

## Output

In [5]:
def get_output_parent(driver):
    """Return the ``<table>`` elements of the current page, if it lists containers.

    Args:
        driver: object exposing ``find_elements_by_tag_name``.

    Returns:
        The list of all table elements when the page has at least seven of
        them, otherwise ``None`` (logged as "no containers found").

    Raises:
        Exception: when fewer than three tables are present, which is treated
            as a sign that the browser is showing an error page.
    """
    found = driver.find_elements_by_tag_name("table")
    table_count = len(found)

    # Too few tables to be the expected page at all
    if table_count < 3:
        raise Exception("not enough tables, driver may be in a 404 or 503 error page, aborting.")

    # Enough tables to contain the container listing
    if table_count >= 7:
        return found

    logging.info("no containers found")
    return None

In [6]:
def _read_number(cells, position, convert, label, failure_verb):
    """Parse the numeric text of ``cells[position]``, ignoring thousands commas.

    Returns the converted value, or ``None`` (after logging) when the cell
    does not exist or its text cannot be converted.
    """
    try:
        text = cells[position].text.strip()
        return convert(text.replace(",", ""))
    except IndexError:
        logging.info("%s not found", label)
    except ValueError:
        logging.info("%s could not be %s", label, failure_verb)
    return None


def execute_multiple_output(driver, knowledge, parent):
    """Extract every container listed on the current page and save it.

    Args:
        driver: unused; kept so the call signature stays the same.
        knowledge: unused; kept so the call signature stays the same.
        parent: sequence of table elements. ``parent[2]`` supplies the detail
            rows and ``parent[3]`` the container rows; the first row of each
            table is excluded by the CSS selector. Detail row ``i`` is paired
            with container row ``i``.

    For each container row a document is built with the keys ``container``,
    ``sunat_size``, ``sunat_tara``, ``sunat_package_count``, ``sunat_weight``
    and ``carrier``, then passed to ``insert_or_update`` keyed on
    ``container``. Values that are missing or unparsable are stored as
    ``None``.
    """
    detail_rows = parent[2].find_elements_by_css_selector("tr:not(:first-child)")
    container_rows = parent[3].find_elements_by_css_selector("tr:not(:first-child)")
    logging.info("%d containers found", len(container_rows))

    for index, row in enumerate(container_rows):
        cells = row.find_elements_by_tag_name("td")

        # A row without any <td> has no container number to key on. Reading
        # cells[0] would raise an IndexError that nothing here catches, so
        # the row is skipped and the remaining rows are still saved.
        if not cells:
            logging.warning("container row %d has no cells, skipping", index)
            continue

        container = {"container": cells[0].text.strip()}

        # Container size is kept as text
        try:
            container["sunat_size"] = cells[1].text.strip()
        except IndexError:
            container["sunat_size"] = None
            logging.info("container size not found")

        container["sunat_tara"] = _read_number(cells, 5, float, "container tara", "read")

        # When there is no detail row at this index, the two detail keys are
        # left out of the document entirely rather than set to None.
        try:
            detail_cells = detail_rows[index].find_elements_by_tag_name("td")
        except IndexError:
            logging.info("detail row not found")
        else:
            container["sunat_package_count"] = _read_number(
                detail_cells, 0, int, "package count", "processed")
            container["sunat_weight"] = _read_number(
                detail_cells, 1, float, "weight", "processed")

        # The carrier is looked up by the first four characters of the
        # container number, upper-cased.
        carrier_entry = container_prefixes.find_one({
            "prefix": container["container"][:4].upper()
        })
        container["carrier"] = carrier_entry["carrier"] if carrier_entry is not None else None

        insert_or_update(container_table, container, ["container"])

In [7]:
def update_single(driver, knowledge):
    """Flag ``knowledge`` as processed, persist it, then pause five seconds.

    Args:
        driver: unused; kept so the call signature stays the same.
        knowledge: dict for one knowledge record. Its ``processed`` entry is
            set to ``True`` in place before it is handed to
            ``insert_or_update``, matched on ``year``, ``manifest`` and
            ``detail``.
    """
    identifying_keys = ["year", "manifest", "detail"]
    knowledge["processed"] = True
    insert_or_update(knowledge_table, knowledge, identifying_keys)
    # Fixed pause before the caller moves on to the next record
    time.sleep(5)

## Iteration

In [8]:
# Process unprocessed knowledges in batches of up to 1000 until none are left.
while True:
    # Find knowledges
    cursor = knowledge_table.find({"processed": False}).limit(1000)
    knowledges = list(cursor)
    if not knowledges:
        break

    # Execute scraper: one browser session per batch
    driver = webdriver.Chrome(executable_path = "../../driver/chromedriver")
    try:
        for knowledge in knowledges:
            try:
                execute_input(driver, knowledge)
                parent = get_output_parent(driver)
                if parent is not None:
                    execute_multiple_output(driver, knowledge, parent)
                update_single(driver, knowledge)
            except Exception:
                # Abandon the rest of this batch. The failed knowledge is not
                # marked as processed, so the next pass picks it up again.
                logging.exception("Exception occured")
                break
    finally:
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window. The finally clause makes this run even
        # when the batch is interrupted by something that is not an
        # Exception (e.g. a kernel interrupt).
        driver.quit()
    # Sleep 1 minute
    time.sleep(60)

In [9]:
# Best-effort cleanup of a browser session left over from the cell above.
try:
    # quit() ends the whole WebDriver session, not just the current window
    driver.quit()
except NameError:
    # No driver was ever created in this kernel; nothing to clean up
    pass
except Exception:
    # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
    # the way a bare ``except`` does, and the failure is recorded in the log.
    logging.info("driver cleanup failed, it was probably already closed", exc_info=True)