# Scrabu Prototype (for a List of Shipment Numbers)

This notebook is for creating a prototype of the Scrabu project. The goal is to download DHL pages for a specific shipment number and scrap the shipment information from it.

#### Generate a list of shipment numbers

In [25]:
def generate_shipment_numbers(start_shipment_number=340434188193324407, size=100):
    import numpy as np
    shipment_numbers_list = []
    for i in np.arange(1, size):
        shipment_number = start_shipment_number + i
        shipment_numbers = list(map(int, str(shipment_number)))
        shipment_number = str(shipment_number).rjust(20, '0')
        shipment_numbers_list.append(shipment_number)
    return shipment_numbers_list

In [26]:
shipment_numbers = generate_shipment_numbers(start_shipment_number=340434188193324407, size=100)
print("First elements of the list", shipment_numbers[0:100])

First elements of the list ['00340434188193324408', '00340434188193324409', '00340434188193324410', '00340434188193324411', '00340434188193324412', '00340434188193324413', '00340434188193324414', '00340434188193324415', '00340434188193324416', '00340434188193324417', '00340434188193324418', '00340434188193324419', '00340434188193324420', '00340434188193324421', '00340434188193324422', '00340434188193324423', '00340434188193324424', '00340434188193324425', '00340434188193324426', '00340434188193324427', '00340434188193324428', '00340434188193324429', '00340434188193324430', '00340434188193324431', '00340434188193324432', '00340434188193324433', '00340434188193324434', '00340434188193324435', '00340434188193324436', '00340434188193324437', '00340434188193324438', '00340434188193324439', '00340434188193324440', '00340434188193324441', '00340434188193324442', '00340434188193324443', '00340434188193324444', '00340434188193324445', '00340434188193324446', '00340434188193324447', '00340434188

#### Download the HTML content for a list of shipment numbers

In [27]:
def request(shipment_numbers, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode="):
    import requests
    import numpy as np
    for shipment_number in shipment_numbers:
        request_url = start_url + shipment_number
    return requests.get(request_url).content

In [28]:
for shipment_number in shipment_numbers:
    html_response = request(shipment_number)
    print("Sample of HTML response:\n", html_response[0:100])

Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STAT

Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STATE__ = {\n    '
Sample of HTML response:
 b'\n<div id="verfolgen-main-content">\n  \n\n\n\n\n\n\t\n\n\n<script>\n  \n    window.__INITIAL_APP_STAT

#### Parsing HTML and converting it into JSON

In [44]:
def html_to_json(html, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_numbers=None):
    from lxml import etree
    import json
    
    html_tree = etree.HTML(html)
    
    def clean_json(json):
        for i in html_response:
            start = json.find('JSON.parse(')
            end = json.find('"),', start)
            json = json[start:end]
            json = json.replace('JSON.parse("', '')
            json = json.replace('\\', '')
    return json
    
    def find_json_element(html_tree):
        json_element = html_tree.xpath('//div')
        return str(etree.tostring(json_element[0]))
    
    dirty_json = find_json_element(html_tree)
    json_string = clean_json(dirty_json)
    return json.loads(json_string)

In [45]:
shipment_details_json = html_to_json(html_response)
print(shipment_details_json)

<module 'json' from '/opt/conda/lib/python3.7/json/__init__.py'>


#### Structure the shipment details in JSON format

In [50]:
def shipment_details(shipment_details_json, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number="00340434188193324407"):
    import datetime
    for i in shipment_details_json:
        delivery_history_dict = {}
        delivery_history_dict["shipment_number"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsnummern"].get("sendungsnummer")
        delivery_history_dict["crawltime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        delivery_history_dict["url"] = start_url + shipment_number 
        delivery_history_dict["events"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsverlauf"].get("events", [])
        return delivery_history_dict

In [51]:
shipment_history = shipment_details(shipment_details_json)
print(shipment_history)

TypeError: 'module' object is not iterable

In [12]:
import json
def save_dictionary(shipment_history=None, filename=None):
    print("Writing file {}".format(filename))
    with open(filename, 'w') as f:
        json.dump(shipment_history, f)

In [13]:
save_dictionary(shipment_history, filename="../data/00340434188193324407.json")

Writing file ../data/00340434188193324407.json


#### Main function, which goes through all shipment numbers and uses the previous methods

In [14]:
import time
def main():
    shipment_numbers = generate_shipment_numbers(shipment_number=340434188193324407, size=100)
    for shipment_number in shipment_numbers:
        time.sleep(0.5)
        print("Processing shipment number {}".format(shipment_number))
        html_response = request(shipment_number=shipment_number)
        shipment_details_json = html_to_json(html_response)
        shipment_history = shipment_details(shipment_details_json)
        save_dictionary(shipment_history, filename="../data/{}.json".format(shipment_number))

In [15]:
main()

TypeError: generate_shipment_numbers() got an unexpected keyword argument 'shipment_number'