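# Find manifests

Scrape the manifest list (year, manifest number, arrival date, discharge date and ship) from the aduanet.gob.pe search page with Selenium, and insert or update each row in the `manifests` collection of the `tracking_scraper` MongoDB database.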

In [1]:
from datetime import datetime, timedelta
from pymongo import MongoClient
from selenium import webdriver

import time

In [2]:
# Create the MongoDB client with no arguments, so pymongo's default
# connection settings are used
client = MongoClient()

# Handles for the scraper database and the collection that stores manifests
database = client.get_database("tracking_scraper")
multiple_collection = database.get_collection("manifests")

## Input

In [3]:
# Launch Chrome through the bundled chromedriver and load the search page
driver = webdriver.Chrome(executable_path="../../driver/chromedriver")
driver.get("http://www.aduanet.gob.pe/aduanas/informao/HRMCFLlega.htm")
time.sleep(5)  # fixed pause after navigation

# Fill the date field with the date 26 days before today (dd/mm/yyyy)
date_field = driver.find_elements_by_name("CMc2_Fecha1")[0]
query_date = datetime.today() - timedelta(days=26)
date_field.clear()
date_field.send_keys(query_date.strftime("%d/%m/%Y"))
time.sleep(1)

# Click the first <input type="button"> on the page to submit the form
submit_button = driver.find_elements_by_css_selector("input[type=button]")[0]
submit_button.click()
time.sleep(5)  # fixed pause after submitting

## Output

In [26]:
def insert_or_update(collection, document, query_keys):
    """Update the record matching ``query_keys``, or insert it if none matches.

    The values of ``document`` under ``query_keys`` form the filter. If a
    record matches, all fields of ``document`` are written with ``$set`` and
    ``updated_at`` is set to the current UTC time. Otherwise a new record is
    inserted with ``created_at`` set to the current UTC time,
    ``updated_at`` set to ``None`` and ``processed`` set to ``False``.

    Args:
        collection: Object exposing ``update_one(filter, update)`` and
            ``insert_one(document)`` (in this notebook, a pymongo collection).
        document: Dict of fields to store. It is only copied, never modified.
        query_keys: Iterable of keys of ``document`` that identify a record.

    Returns:
        None. The outcome is reported with ``print``.

    Raises:
        KeyError: If a key in ``query_keys`` is missing from ``document``.
    """
    # Filter containing only the identifying fields
    query_doc = {key: document[key] for key in query_keys}

    # Shallow copy for the update so the caller's dict is left untouched
    update_doc = dict(document)
    update_doc["updated_at"] = datetime.utcnow()

    # Try to update an existing record first
    result = collection.update_one(query_doc, {"$set": update_doc})
    if result.matched_count > 0:
        print("updated:", query_doc)
        return

    # Nothing matched: insert a new record. The insert-only fields go on a
    # separate copy; the original code wrote them to `document` and then
    # passed an undefined `insert_doc` name to insert_one (NameError).
    insert_doc = dict(document)
    insert_doc["created_at"] = datetime.utcnow()
    insert_doc["updated_at"] = None
    insert_doc["processed"] = False

    result = collection.insert_one(insert_doc)
    print("insert:", query_doc, result.inserted_id)

In [4]:
# Results table: the fourth <table> element on the page
table = driver.find_elements_by_tag_name("table")[3]

# Every <tr> that is not the first child of its parent is read as a manifest
rows = table.find_elements_by_css_selector("tr:not(:first-child)")
for row in rows:
    cells = row.find_elements_by_tag_name("td")

    # First cell is split on " - " into a year suffix and the manifest number;
    # the year suffix is prefixed with "20"
    parts = cells[0].text.split(" - ")
    manifest = {"year": "20" + parts[0], "manifest": parts[1]}

    # Second and third cells hold the arrival and discharge dates; text that
    # does not parse as dd/mm/yyyy is stored as None
    for field, index in (("arrival", 1), ("discharge", 2)):
        raw = cells[index].text.strip()
        try:
            manifest[field] = datetime.strptime(raw, "%d/%m/%Y")
        except ValueError:
            manifest[field] = None

    # Fourth cell is the ship name
    manifest["ship"] = cells[3].text.strip()

    # Update the stored manifest with the same year and number, or insert it
    insert_or_update(multiple_collection, manifest, ["year", "manifest"])

In [5]:
# End the whole WebDriver session: quit() closes every window and shuts down
# the chromedriver process, whereas close() only closes the current window
driver.quit()

## Check database

In [6]:
# Step through the stored manifests one at a time, pausing for confirmation
cursor = multiple_collection.find()
for stored in cursor:
    print(stored)
    answer = input("Continue by pressing Enter, or type 'quit' to exit: ")
    if answer.lower() == "quit":
        break

{'_id': ObjectId('5c92e2b69303fc1fa350e804'), 'processed': False, 'discharge': datetime.datetime(2019, 2, 23, 0, 0), 'manifest': '456', 'updated_at': None, 'year': '2019', 'created_at': datetime.datetime(2019, 3, 21, 1, 2, 46, 665000), 'ship': 'SAN ANTONIO EXPRESS', 'arrival': datetime.datetime(2019, 2, 22, 0, 0)}
Continue by pressing Enter, or type 'quit' to exit: 
{'_id': ObjectId('5c92e2b69303fc1fa350e805'), 'processed': False, 'discharge': None, 'manifest': '462', 'updated_at': None, 'year': '2019', 'created_at': datetime.datetime(2019, 3, 21, 1, 2, 46, 750000), 'ship': 'FAIRCHEM FORTE', 'arrival': datetime.datetime(2019, 2, 22, 0, 0)}
Continue by pressing Enter, or type 'quit' to exit: 
{'_id': ObjectId('5c92e2b69303fc1fa350e806'), 'processed': False, 'discharge': datetime.datetime(2019, 2, 23, 0, 0), 'manifest': '400', 'updated_at': None, 'year': '2019', 'created_at': datetime.datetime(2019, 3, 21, 1, 2, 46, 827000), 'ship': 'MOL BEYOND', 'arrival': datetime.datetime(2019, 2, 22,