# Scrabu Prototype (for a Single Shipment Number)

This notebook is a prototype of the Scrabu project. The goal is to download the DHL tracking page for a specific shipment number and scrape the shipment information from it.

In [36]:
# request & response handler
import requests
import numpy as np

def request(start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number=None, timeout=30):
    """Download the DHL tracking page for a single shipment number.

    Args:
        start_url: URL prefix; the zero-padded shipment number is appended to it.
        shipment_number: Shipment number as an ``int`` or a string of ASCII
            digits. It is left-padded with zeros to 20 characters.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block the notebook forever.

    Returns:
        The body of the HTTP response (``response.content``).

    Raises:
        ValueError: If ``shipment_number`` is missing, empty, or contains
            anything other than the digits 0-9.
        requests.HTTPError: If the server answers with an error status.
    """
    if shipment_number is None:
        raise ValueError("shipment_number is required")

    digits = str(shipment_number)
    # Validate up front: a malformed number would otherwise be sent to the
    # server and only surface later as a confusing parser failure.
    if not digits or not set(digits) <= set("0123456789"):
        raise ValueError(
            "shipment_number must consist of digits only, got %r" % (shipment_number,)
        )

    # The tracking URL expects a 20-character, zero-padded piece code.
    request_url = start_url + digits.zfill(20)

    response = requests.get(request_url, timeout=timeout)
    # Fail here on 4xx/5xx instead of passing an error page on to the parser.
    response.raise_for_status()
    return response.content

In [37]:
# Fetch the tracking page for one fixed shipment number; `html_response` holds whatever request() returns.
html_response = request(shipment_number="00340434188193324407")

In [27]:
# parser
from lxml import etree
import json

def html_to_json(html, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number="00340434188193324407"):
    """Extract the JSON payload embedded in a downloaded tracking page.

    The page is parsed with lxml, the first ``<div>`` is serialised back to
    text, and the argument of the first ``JSON.parse("...")`` call found in
    that text is decoded.

    Args:
        html: Page markup as returned by the download step.
        start_url: Not used by the parser; kept so existing calls keep working.
        shipment_number: Not used by the parser; kept so existing calls keep
            working.

    Returns:
        The decoded JSON document (whatever ``json.loads`` produces).

    Raises:
        ValueError: If the page has no ``<div>``, if no complete
            ``JSON.parse("...")`` call is found, or if the extracted text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    html_tree = etree.HTML(html)

    def clean_json(markup):
        # Cut out everything between 'JSON.parse("' and the closing '"),'.
        start = markup.find('JSON.parse(')
        end = markup.find('"),', start) if start != -1 else -1
        if start == -1 or end == -1:
            # str.find() signals "not found" with -1; slicing with it would
            # silently yield garbage, so fail with an explicit message.
            raise ValueError('no complete JSON.parse("...") call found in the page')
        payload = markup[start:end].replace('JSON.parse("', '')
        # NOTE(review): removing every backslash un-escapes the JS string
        # literal, but it also drops escapes inside JSON string values
        # (e.g. \" or \uXXXX) - confirm against a real page.
        return payload.replace('\\', '')

    def find_json_element(html_tree):
        # NOTE(review): guards against a None tree in case the parser yields
        # no document for empty input - confirm with the lxml docs.
        divs = html_tree.xpath('//div') if html_tree is not None else []
        if not divs:
            raise ValueError('no <div> element found in the page')
        # str() of the serialised element is kept as-is; clean_json() strips
        # the backslashes that this conversion may introduce.
        return str(etree.tostring(divs[0]))

    dirty_json = find_json_element(html_tree)
    return json.loads(clean_json(dirty_json))

In [28]:
# Parse the downloaded page into a Python object, using the parser's default arguments.
shipment_details_json = html_to_json(html_response)

In [29]:
import datetime


def shipment_details(shipment_details_json, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number="00340434188193324407"):
    """Condense the parsed tracking payload into a flat summary dict.

    Args:
        shipment_details_json: Parsed payload; the first entry of its
            ``"sendungen"`` list is read.
        start_url: URL prefix used to build the ``"url"`` entry.
        shipment_number: Number appended to ``start_url`` for the ``"url"``
            entry.

    Returns:
        A dict with the keys ``"shipment_number"``, ``"crawltime"`` (local
        time formatted as ``YYYY-MM-DD HH:MM:SS``), ``"url"`` and ``"events"``
        (an empty list when the payload has no events).
    """
    details = shipment_details_json["sendungen"][0]["sendungsdetails"]

    tracked_number = details["sendungsnummern"].get("sendungsnummer")
    crawled_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    page_url = start_url + shipment_number
    history = details["sendungsverlauf"].get("events", [])

    return {
        "shipment_number": tracked_number,
        "crawltime": crawled_at,
        "url": page_url,
        "events": history,
    }

In [30]:
# Build the shipment summary dict; start_url and shipment_number fall back to their defaults.
shipment_history = shipment_details(shipment_details_json)

In [31]:
import json
def save_dictionary(shipment_history=None, filename=None):
    """Serialise ``shipment_history`` as JSON into the file at ``filename``.

    The file is opened in text write mode, so any existing content is
    replaced. Nothing is returned.
    """
    with open(filename, mode='w') as output_file:
        json.dump(shipment_history, fp=output_file)

In [32]:
# Write the summary as JSON; the relative path is resolved against the current working directory.
save_dictionary(shipment_history=shipment_history, filename="../data/00340434188193324407.json")