# Scrabu Prototype (for a List of Shipment Numbers)

This notebook is for creating a prototype of the Scrabu project. The goal is to download DHL pages for a specific shipment number and scrap the shipment information from it.

#### Configure Logger

In [216]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO, datefmt='%I:%M:%S')

#### Generate a list of shipment numbers with the check digit calculation

In [217]:
def generate_shipment_numbers(shipment_number=None, size=2): #340434188193324407
    logger.info("Generating shipment numbers with seed: {}".format(shipment_number))
    import numpy as np
    from functools import reduce
    multiplier = [3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3]
    shipment_numbers_list = []
    for i in range(0, size):
        shipment_number = (shipment_number // 10) + 1
        shipment_number_l = list(map(int, str(shipment_number)))
        multiply_number = np.multiply(multiplier, shipment_number_l)
        sum = reduce(lambda x, y: x+y, multiply_number)
        pz = (10 - sum % 10)
        shipment_number_l.append(0 if pz==10 else pz)
        shipment_number = reduce(lambda x,y: x * 10 + y, shipment_number_l)
        shipment_number_str = str(shipment_number).rjust(20, '0')
        shipment_numbers_list.append(shipment_number_str)
    logger.info("Generated {} unique shipment numbers".format(len(set(shipment_numbers_list))))
    return shipment_numbers_list

In [218]:
shipment_numbers = generate_shipment_numbers(shipment_number=340434188193323500, size=50)

03:06:42 INFO: Generating shipment numbers with seed: 340434188193323500
03:06:42 INFO: Generated 50 unique shipment numbers


#### Download the HTML content for a list of shipment numbers

In [219]:
def request(shipment_number=None, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode="):
    import requests
    import numpy as np
    logger.debug("Making HTTP request for shipment number {}".format(shipment_number))
    request_url = start_url + str(shipment_number)
    return requests.get(request_url).content

In [220]:
html_response = request(shipment_number="00340434188193323500")

#### Parsing HTML and converting it into JSON

In [221]:
def html_to_json(html):
    from lxml import etree
    import json
    logger.debug("Converting HTML to JSON")
    
    html_tree = etree.HTML(html)
    
    def clean_json(dirty_json):
        start = dirty_json.find('JSON.parse(')
        end = dirty_json.find('"),', start)
        cjson = dirty_json[start:end]
        cjson = cjson.replace('JSON.parse("', '')
        cjson = cjson.replace('\\', '')
        return cjson
    
    def find_json_element(html_tree):
        json_element = html_tree.xpath('//div')
        return str(etree.tostring(json_element[0]))
    
    dirty_json = find_json_element(html_tree)
    json_string = clean_json(dirty_json)
    return json.loads(json_string)

In [222]:
shipment_details_json = html_to_json(html_response)

#### Structure the shipment details in JSON format

In [223]:
def shipment_details(shipment_details_json, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode=", shipment_number="00340434188193323500"):
    import datetime
    logger.debug("Preparing JSON for persistance")
    delivery_history_dict = {}
    delivery_history_dict["shipment_number"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsnummern"].get("sendungsnummer")
    delivery_history_dict["crawltime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    delivery_history_dict["url"] = start_url + str(shipment_number) 
    delivery_history_dict["events"] = shipment_details_json["sendungen"][0]["sendungsdetails"]["sendungsverlauf"].get("events", [])
    return delivery_history_dict

In [224]:
shipment_history = shipment_details(shipment_details_json)

In [225]:
import json
def save_dictionary(shipment_history=None, filename=None):
    logger.info("Writing file {}".format(filename))
    with open(filename, 'w') as f:
        json.dump(shipment_history, f)

In [226]:
save_dictionary(shipment_history=None, filename="../data/00340434188193323500.json")

03:06:59 INFO: Writing file ../data/00340434188193323500.json


#### Main function, which goes through all shipment numbers and uses the previous methods

In [235]:
import time
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor as PoolExecutor
def main(shipment_number=None, size=None, start_url="https://www.dhl.de/int-verfolgen/search?language=de&lang=de&domain=de&piececode="):
    shipment_numbers = generate_shipment_numbers(shipment_number=shipment_number, size=size)
    count_saved = 0
    def process_data():
        for shipment_number in shipment_numbers:
            time.sleep(0.5)
            html_response = request(shipment_number=shipment_number)
            shipment_details_json = html_to_json(html_response)
            shipment_history = shipment_details(shipment_details_json, shipment_number=shipment_number)
            if len(shipment_history['events']) > 0:
                save_dictionary(shipment_history, filename="../data/{}.json".format(shipment_number))
                count_saved = count_saved + 1
            else:
                logger.info("No events found for shipment number {}".format(shipment_number))
            start_index = start_index + 1
    with PoolExecutor(max_workers=4) as executor:
        futures_to_url = [executor.submit(process_data, request_url)
        for future in concurrent.futures.as_completed(futures_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                print('%r page is %d bytes' % (url, len(data)))
                
        logger.info("Saved {} files out of {}".format(count_saved, size))
    end = time.time()
    diff = end - start
    print(diff)


SyntaxError: invalid syntax (<ipython-input-235-205e7ea3940c>, line 21)

In [236]:
main(shipment_number=340434188193323500, size=100)

03:12:10 INFO: Generating shipment numbers with seed: 340434188193323500
03:12:10 INFO: Generated 100 unique shipment numbers


NameError: name 'future_to_url' is not defined

In [None]:
import concurrent.futures
import urllib.request
import time

start = time.time()

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        time.sleep(10)
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
    
end = time.time()
diff = end - start
print(diff)