# Find manifests

In [1]:
from datetime import datetime, timedelta
from pymongo import MongoClient
from selenium import webdriver

import logging
import time

In [2]:
# Logging goes to one file per day; the date stamp is taken from the local clock
# at the moment this cell runs.
today = datetime.now().strftime("%Y%m%d")
log_path = "../../logs/manifests-" + today + ".log"
logging.basicConfig(
    filename=log_path,
    level=logging.INFO,
    format="[%(levelname)s %(asctime)s] %(message)s",
)

# MongoDB handles: client with default connection settings, the "scraper2"
# database, and the "manifests" collection the scraped rows are written to.
client = MongoClient()
database = client.get_database("scraper2")
manifests_table = database.get_collection("manifests")

## Input

In [3]:
# Start Chrome through the chromedriver binary kept in the repository and open
# the target page. The fixed sleeps are presumably there to let the page settle.
driver = webdriver.Chrome(executable_path="../../driver/chromedriver")
driver.get("http://www.aduanet.gob.pe/aduanas/informao/HRMCFLlega.htm")
time.sleep(5)

# Replace the contents of the first field named "CMc2_Fecha1" with the date
# two days before today, formatted as dd/mm/YYYY.
query_date = datetime.today() - timedelta(days=2)
date_input = driver.find_elements_by_name("CMc2_Fecha1")[0]
date_input.clear()
date_input.send_keys(query_date.strftime("%d/%m/%Y"))
time.sleep(1)

# Click the first button-type input on the page, then wait for the result.
driver.find_elements_by_css_selector("input[type=button]")[0].click()
time.sleep(5)

## Output

In [4]:
def insert_or_update(collection, document, query_keys):
    """Update the record identified by ``query_keys``, or insert it if absent.

    A filter is built from ``document``'s values for ``query_keys``. When
    ``collection.update_one`` reports a match, every field of ``document`` is
    written with ``$set`` together with a fresh ``updated_at`` timestamp.
    Otherwise a copy of ``document`` is inserted with ``created_at`` set to
    the current UTC time, ``updated_at`` set to ``None`` and ``processed``
    set to ``False``.

    The caller's ``document`` is never modified: the bookkeeping fields are
    added to a shallow copy, so anything the collection adds to the inserted
    dict stays out of the caller's dict as well.

    Args:
        collection: Object exposing ``update_one(filter, update)`` and
            ``insert_one(document)``; in this notebook, a pymongo collection.
        document: Dict holding the fields to store.
        query_keys: Iterable of keys of ``document`` that identify the record.
            NOTE: an empty iterable yields an empty filter, which MongoDB
            treats as matching any document.

    Returns:
        None.

    Raises:
        KeyError: If a key in ``query_keys`` is missing from ``document``.
    """
    # Filter containing only the identifying fields
    query_doc = {key: document[key] for key in query_keys}

    # Fields written on update: shallow copy of the document plus a timestamp
    update_doc = dict(document)
    update_doc["updated_at"] = datetime.utcnow()

    # Try to update an existing record first
    result = collection.update_one(query_doc, {"$set": update_doc})
    if result.matched_count > 0:
        logging.info("updated: %s", query_doc)
        return

    # Nothing matched: insert a copy so the caller's dict is left untouched
    new_doc = dict(document)
    new_doc["created_at"] = datetime.utcnow()
    new_doc["updated_at"] = None
    new_doc["processed"] = False

    collection.insert_one(new_doc)
    logging.info("insert: %s", query_doc)

In [5]:
def parse_date_or_none(text, missing_message):
    """Parse a dd/mm/YYYY string into a datetime.

    When ``text`` does not match the format, ``missing_message`` is logged at
    INFO level and ``None`` is returned.
    """
    try:
        return datetime.strptime(text, "%d/%m/%Y")
    except ValueError:
        logging.info(missing_message)
        return None


# The listing is read from the fourth <table> element on the page
listing = driver.find_elements_by_tag_name("table")[3]

# Walk every row that is not a first child (this skips the first row,
# presumably the header)
for row in listing.find_elements_by_css_selector("tr:not(:first-child)"):
    cells = row.find_elements_by_tag_name("td")

    # First cell: two parts separated by " - "; the first part is prefixed
    # with "20" to form the year (presumably a two-digit year), the second
    # part is the manifest number
    parts = cells[0].text.split(" - ")
    year = "20" + parts[0]
    number = parts[1]
    logging.info("manifest: %s-%s", year, number)

    # Second and third cells: arrival and discharge dates, None when the
    # cell does not hold a dd/mm/YYYY date
    arrival = parse_date_or_none(cells[1].text.strip(), "arrival info not found")
    discharge = parse_date_or_none(cells[2].text.strip(), "discharge info not found")

    # Fourth cell: ship name
    ship = cells[3].text.strip()

    manifest = {
        "year": year,
        "manifest": number,
        "arrival": arrival,
        "discharge": discharge,
        "ship": ship,
    }

    # Update the stored manifest with this year/number, or insert a new one
    insert_or_update(manifests_table, manifest, ["year", "manifest"])

In [6]:
# quit() ends the whole WebDriver session: it closes every window and shuts
# down the chromedriver process, whereas close() only closes the current window.
driver.quit()