# Scrabu Prototype (for a List of Shipment Numbers)

This notebook is for creating a prototype of the Scrabu project. The goal is to download DHL pages for a specific shipment number and scrap the shipment information from it.

#### Configure Logger

In [65]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

#### Generate a list of shipment numbers

In [45]:
def generate_shipment_numbers(shipment_number=None, size=2): #340434188193324407
    logger.info("Generating shipment numbers with seed: {}".format(shipment_number))
    import numpy as np
    shipment_numbers_list = []
    for i in range(0, size):
        shipment_number_str = str(shipment_number).rjust(20, '0')
        shipment_numbers_list.append(shipment_number_str)
        shipment_number = shipment_number + 1
    logger.info("Generated {} unique shipment numbers".format(len(set(shipment_numbers_list))))
    return shipment_numbers_list

In [46]:
shipment_numbers = generate_shipment_numbers(shipment_number=340434188193323500, size=50)

04:10:29 INFO: Generating shipment numbers with seed: 340434188193323500
04:10:29 INFO: Generated 50 unique shipment numbers


#### Download the HTML content for a list of shipment numbers

In [47]:
def request(shipment_number=None, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode="):
    import requests
    import numpy as np
    logger.debug("Making HTTP request for shipment number {}".format(shipment_number))
    request_url = start_url + str(shipment_number)
    return requests.get(request_url).content

In [48]:
html_response = request(shipment_number="00340434188193323500")

04:10:29 DEBUG: Making HTTP request for shipment number 00340434188193323500
04:10:29 DEBUG: Starting new HTTPS connection (1): www.dhl.de:443
04:10:29 DEBUG: https://www.dhl.de:443 "GET /int-verfolgen/search?language=de&lang=de&domain=de&piececode=00340434188193323500 HTTP/1.1" 200 1072


#### Parsing HTML and converting it into JSON

In [49]:
def html_to_json(html):
    from lxml import etree
    import json
    logger.debug("Converting HTML to JSON")
    
    html_tree = etree.HTML(html)
    
    def clean_json(dirty_json):
        start = dirty_json.find('JSON.parse(')
        end = dirty_json.find('"),', start)
        cjson = dirty_json[start:end]
        cjson = cjson.replace('JSON.parse("', '')
        cjson = cjson.replace('\\', '')
        return cjson
    
    def find_json_element(html_tree):
        json_element = html_tree.xpath('//div')
        return str(etree.tostring(json_element[0]))
    
    dirty_json = find_json_element(html_tree)
    json_string = clean_json(dirty_json)
    return json.loads(json_string)

In [50]:
shipment_details_json = html_to_json(html_response)

04:10:30 DEBUG: Converting HTML to JSON


#### Structure the shipment details in JSON format

In [51]:
def shipment_details(shipment_details_json, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number="00340434188193323500"):
    import datetime
    logger.debug("Preparing JSON for persistance")
    delivery_history_dict = {}
    delivery_history_dict["shipment_number"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsnummern"].get("sendungsnummer")
    delivery_history_dict["crawltime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    delivery_history_dict["url"] = start_url + str(shipment_number) 
    delivery_history_dict["events"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsverlauf"].get("events", [])
    return delivery_history_dict

In [52]:
shipment_history = shipment_details(shipment_details_json)

04:10:31 DEBUG: Preparing JSON for persistance


In [53]:
import json
def save_dictionary(shipment_history=None, filename=None):
    logger.info("Writing file {}".format(filename))
    with open(filename, 'w') as f:
        json.dump(shipment_history, f)

In [54]:
save_dictionary(shipment_history=None, filename="../data/00340434188193323500.json")

04:10:31 INFO: Writing file ../data/00340434188193323500.json


#### Main function, which goes through all shipment numbers and uses the previous methods

In [69]:
import time
def main(shipment_number=None, size=None):
    shipment_numbers = generate_shipment_numbers(shipment_number=shipment_number, size=size)
    count_saved = 0
    for shipment_number in shipment_numbers:
        time.sleep(0.5)
        html_response = request(shipment_number=shipment_number)
        shipment_details_json = html_to_json(html_response)
        shipment_history = shipment_details(shipment_details_json, shipment_number=shipment_number)
        if len(shipment_history['events']) > 0:
            save_dictionary(shipment_history, filename="../data/{}.json".format(shipment_number))
            count_saved = count_saved + 1
        else:
            logger.info("No events found for shipment number {}".format(shipment_number))
    logger.info("Saved {} files out of {}".format(count_saved, size))

In [77]:
main(shipment_number=340434174857837116, size=30)

04:29:05 INFO: Generating shipment numbers with seed: 340434174857837116
04:29:05 INFO: Generated 30 unique shipment numbers
04:29:06 INFO: Writing file ../data/00340434174857837116.json
04:29:06 INFO: No events found for shipment number 00340434174857837117
04:29:07 INFO: No events found for shipment number 00340434174857837118
04:29:08 INFO: No events found for shipment number 00340434174857837119
04:29:08 INFO: No events found for shipment number 00340434174857837120
04:29:09 INFO: No events found for shipment number 00340434174857837121
04:29:10 INFO: No events found for shipment number 00340434174857837122
04:29:11 INFO: Writing file ../data/00340434174857837123.json
04:29:12 INFO: No events found for shipment number 00340434174857837124
04:29:12 INFO: No events found for shipment number 00340434174857837125
04:29:13 INFO: No events found for shipment number 00340434174857837126
04:29:14 INFO: No events found for shipment number 00340434174857837127
04:29:14 INFO: No events found 