# Find knowledges (bills of lading) for unprocessed manifests

In [1]:
from datetime import datetime, timedelta
from pymongo import MongoClient
from selenium import webdriver

import json
import time
import logging

In [2]:
# Send INFO-and-above records to the log file for this scraping run.
logging.basicConfig(
    filename="../../logs/knowledges-20190327.log",
    level=logging.INFO,
    format="[%(levelname)s %(asctime)s] %(message)s",
)

# Connect with the client's default settings and pick the working database.
client = MongoClient()
database = client["tracking_scraper"]

# "single" holds one document per manifest; "multiple" holds the many
# knowledge rows scraped for each manifest.
single_collection = database["manifests"]
multiple_collection = database["knowledges"]

In [3]:
# Load every manifest that has not been processed yet into memory, so the
# scraping loop does not depend on a long-lived database cursor.
cursor = single_collection.find({"processed": False})
manifests = list(cursor)
print(len(manifests))

# 255 - 52

203


## Input

In [4]:
def execute_input(driver):
    """Open the manifest query form, fill it in and submit it.

    NOTE: the year and manifest number are read from the module-level name
    ``manifest`` (it is not a parameter of this function), so that name
    must be bound before this function is called.
    """
    form_url = ("http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias"
                "?accion=cargarFrmConsultaManifiesto&tipo=M")
    driver.get(form_url)
    time.sleep(5)

    # Fill the year field first, then the manifest-number field, pausing
    # one second after each.
    for field_name, manifest_key in (("CMc1_Anno", "year"), ("CMc1_Numero", "manifest")):
        field = driver.find_elements_by_name(field_name)[0]
        field.clear()
        field.send_keys(manifest[manifest_key])
        time.sleep(1)

    # Submit through the first button-type input on the page and wait for
    # the result page.
    submit = driver.find_elements_by_css_selector("input[type=button]")[0]
    submit.click()
    time.sleep(5)

## Output

In [5]:
def get_output_parent(driver):
    """Return every ``<table>`` element found on the driver's current page."""
    tables = driver.find_elements_by_tag_name("table")
    return tables

In [6]:
def execute_single_output(driver, manifest, tables):
    """Copy the manifest-level fields from the result page into ``manifest``.

    The third table (``tables[2]``) is read row by row; the value of each
    field is taken from the second ``<td>`` of its row. ``manifest`` is
    updated in place and nothing is returned. ``driver`` is not used.

    An unparseable arrival or discharge date is only logged (the key is
    left untouched), whereas a missing or unparseable transmission date
    stores ``None`` under ``"transmission"``.
    """
    summary_rows = tables[2].find_elements_by_tag_name("tr")

    def value_of(row):
        # The value lives in the second cell of the row.
        return row.find_elements_by_tag_name("td")[1].text.strip()

    def shifted(raw, pattern):
        # Parsed timestamps are moved forward by five hours (same result as
        # subtracting a -5 hour delta).
        # NOTE(review): presumably converts local UTC-5 time to UTC - confirm.
        return datetime.strptime(raw, pattern) + timedelta(hours=5)

    # Arrival (row 1) and discharge (row 2) share format and failure policy.
    dated_fields = (
        ("arrival", 1, "arrival date could not be read"),
        ("discharge", 2, "discharge date could not be read"),
    )
    for key, position, failure_message in dated_fields:
        raw = value_of(summary_rows[position])
        try:
            manifest[key] = shifted(raw, "%d/%m/%Y %H:%M")
        except ValueError:
            logging.info(failure_message)

    # Ship (row 3) and transport enterprise (row 4) are stored as plain text.
    manifest["ship"] = value_of(summary_rows[3])
    manifest["enterprise"] = value_of(summary_rows[4])

    # Transmission date (row 7) may be absent from the table altogether.
    try:
        raw = value_of(summary_rows[7])
    except IndexError:
        logging.info("transmission date not found")
        manifest["transmission"] = None
        return

    try:
        manifest["transmission"] = shifted(raw, "%d/%m/%Y %H:%M:%S")
    except ValueError:
        logging.info("transmission date could not be read")
        manifest["transmission"] = None

In [7]:
def insert_or_update(collection, document, query_keys):
    """Update the document matching ``query_keys``, or insert it if none matches.

    The filter is built from the values ``document`` holds under each key
    in ``query_keys``. When at least one stored document matches, every
    field of ``document`` is written with ``$set`` together with a fresh
    ``updated_at`` timestamp, and ``document`` itself is left unchanged.
    Otherwise ``document`` is stamped in place with ``created_at``,
    ``updated_at = None`` and, if absent, ``processed = False``, and is
    then inserted.
    """
    selector = {key: document[key] for key in query_keys}

    # Work on a shallow copy so the caller's dict is not touched on update.
    changes = {**document, "updated_at": datetime.utcnow()}

    outcome = collection.update_one(selector, {"$set": changes})
    if outcome.matched_count > 0:
        logging.info("updated: %s", selector)
        return

    # Nothing matched: stamp the caller's document and insert it.
    document["created_at"] = datetime.utcnow()
    document["updated_at"] = None
    document.setdefault("processed", False)

    collection.insert_one(document)
    logging.info("insert: %s", selector)

In [8]:
def execute_multiple_output(driver, manifest, tables):
    """Scrape each knowledge row of the fourth table and save it.

    Every ``<tr>`` of ``tables[3]`` except the first child is turned into
    a dict tagged with the manifest's ``year`` and ``manifest`` values and
    then handed to ``insert_or_update`` on ``multiple_collection``, keyed
    by year, manifest and detail. When there is no fourth table this is
    logged and nothing else happens. ``driver`` is not used.
    """
    if len(tables) < 4:
        logging.info("no knowledges found")
        return

    def text_at(cells, index):
        # Stripped visible text of one cell of the current row.
        return cells[index].text.strip()

    data_rows = tables[3].find_elements_by_css_selector("tr:not(:first-child)")
    logging.info("%d knowledges found", len(data_rows))

    for data_row in data_rows:
        cells = data_row.find_elements_by_tag_name("td")

        # Identification of the knowledge: owning manifest, origin port,
        # knowledge identifier and detail number.
        knowledge = {
            "year": manifest["year"],
            "manifest": manifest["manifest"],
            "origin_port": text_at(cells, 0),
            "knowledge": text_at(cells, 2),
            "detail": text_at(cells, 4),
        }

        # Weight: commas are dropped before the float conversion.
        raw_weight = text_at(cells, 8)
        try:
            knowledge["weight"] = float(raw_weight.replace(",", ""))
        except ValueError:
            logging.info("weight could not be read")
            knowledge["weight"] = None

        # Package count: commas are dropped before the int conversion.
        raw_count = text_at(cells, 9)
        try:
            knowledge["package_count"] = int(raw_count.replace(",", ""))
        except ValueError:
            logging.info("package count could not be read")
            knowledge["package_count"] = None

        knowledge["consignee"] = text_at(cells, 14)
        knowledge["shipper"] = text_at(cells, 15)
        knowledge["destiny_port"] = text_at(cells, 17)

        # Transmission timestamp (12-hour clock with AM/PM), moved forward
        # by five hours (same result as subtracting a -5 hour delta).
        # NOTE(review): presumably converts local UTC-5 time to UTC - confirm.
        raw_stamp = text_at(cells, 20)
        try:
            parsed = datetime.strptime(raw_stamp, "%d/%m/%Y %I:%M:%S %p")
            knowledge["transmission"] = parsed + timedelta(hours=5)
        except ValueError:
            logging.info("transmission could not be read")
            knowledge["transmission"] = None

        # Save knowledge
        insert_or_update(multiple_collection, knowledge, ["year", "manifest", "detail"])

In [9]:
def update_single(driver, manifest):
    """Flag ``manifest`` as processed, persist it, then pause five seconds.

    The manifest is saved to ``single_collection`` through
    ``insert_or_update``, matched on its year and manifest number.
    ``driver`` is not used.
    """
    manifest["processed"] = True
    identifying_keys = ["year", "manifest"]
    insert_or_update(single_collection, manifest, identifying_keys)
    time.sleep(5)

## Check database

In [10]:
def check_database():
    """Print the stored knowledges one at a time, pausing after each.

    After every document the user is prompted; typing ``quit`` (in any
    letter case) stops the walk, anything else moves on to the next one.
    """
    prompt = "Press Enter to continue, or type 'quit' to cancel: "
    for stored in multiple_collection.find():
        print(stored)
        if input(prompt).lower() == "quit":
            return

## Iterate through manifests to be processed

In [11]:
# Scrape every pending manifest with a single browser session.
driver = webdriver.Chrome(executable_path="../../driver/chromedriver")
try:
    # The loop variable must stay a module-level name called ``manifest``:
    # execute_input() reads it from there instead of taking it as an argument.
    for manifest in manifests:
        try:
            execute_input(driver)
            tables = get_output_parent(driver)
            execute_single_output(driver, manifest, tables)
            execute_multiple_output(driver, manifest, tables)
            update_single(driver, manifest)
        except Exception:
            # Stop at the first failure; the traceback goes to the log file
            # and the remaining manifests stay unprocessed for a later run.
            logging.exception("Exception ocurred")
            break
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and the finally clause makes sure that also happens when the run is
    # interrupted by something that is not an Exception (e.g. Ctrl-C).
    driver.quit()

In [12]:
# Number of manifests still flagged as unprocessed after the run.
single_collection.count_documents({"processed": False})

0