In [1]:
from pymongo import MongoClient, ASCENDING, DESCENDING

import pandas as pd
import numpy as np

In [2]:
class TrackingPreprocessor:
    """Classify scraped container movements per carrier into flat records.

    For every container of a carrier, the movements are read in chronological
    order and only the first and the last one are used. Depending on how many
    movements exist and on their ``estimated`` flags, the container ends up in
    exactly one of five buckets (see :meth:`evaluate_carrier`).
    """

    # TODO: Do not hardcode this
    # Numeric codes written to the output; 0 is used for unknown names.
    CARRIERS = {
        "Maersk"      : 1,
        "Hapag-Lloyd" : 2,
        "Evergreen"   : 3
    }
    VEHICLES = {
        "Vessel" : 1,
        "Truck"  : 2,
        "Train"  : 3
    }

    def __init__(self):
        """Open the ``scraper2`` database and keep handles to its collections."""
        self.scraper_database   = MongoClient()["scraper2"]
        self.scraper_containers = self.scraper_database["containers"]
        self.scraper_movements  = self.scraper_database["container_movements"]
        self.scraper_statuses   = self.scraper_database["container_statuses"]

    def evaluate_carrier(self, carrier):
        """Sort every processed container of ``carrier`` into one bucket.

        Args:
            carrier: Carrier name as stored in the containers collection.

        Returns:
            A 5-tuple of lists, in this order:
            ``preprocessed`` (first and last movement are both real data),
            ``estimated`` (last movement is estimated),
            ``one_movement`` (exactly one movement found),
            ``no_movements`` (container identifiers without any movement),
            ``incoherent`` (anything that matched none of the cases above).
        """
        preprocessed, estimated, one_movement, no_movements, incoherent = [], [], [], [], []

        for container in self.query_containers(carrier):
            movements = list(self.query_movements(container))

            # Case no movements found
            if not movements:
                no_movements.append(container)
                continue

            # Case only one movement found
            first_movement = movements[0]
            if len(movements) == 1:
                self.save_one_movement(one_movement, container, first_movement)
                continue

            last_movement = movements[-1]
            # The flags are compared against the boolean literals on purpose:
            # a value that is neither True nor False must fall through to the
            # "incoherent" bucket instead of being coerced by truthiness.
            # Case last movement is estimated (first one doesn't matter)
            if last_movement["estimated"] == True:  # noqa: E712
                self.save_movements(estimated, container, first_movement, last_movement)
                continue
            # Case both movements contain real data
            if first_movement["estimated"] == False:  # noqa: E712
                self.preprocess_movements(preprocessed, container, carrier, first_movement, last_movement)
                continue

            # This case should never happen
            self.save_movements(incoherent, container, first_movement, last_movement)

        return preprocessed, estimated, one_movement, no_movements, incoherent

    def query_containers(self, carrier):
        """Return the distinct ``container`` values of processed containers of ``carrier``."""
        query = {
            "carrier"   : carrier,
            "processed" : True
        }
        return self.scraper_containers.distinct("container", query)

    def query_movements(self, container):
        """Return a cursor over the movements of ``container``.

        Results are sorted by ``date`` ascending, with ``_id`` ascending as a
        tie-breaker.
        """
        query = {
            "container" : container
        }
        return self.scraper_movements.find(query).sort([("date", ASCENDING), ("_id", ASCENDING)])

    def save_one_movement(self, movements, container, movement):
        """Append a record describing the single ``movement`` of ``container`` to ``movements``."""
        movements.append({
            "container" : container,
            "date"      : movement["date"],
            "status"    : movement["status"],
            "location"  : movement["location"]
        })

    def save_movements(self, movements, container, first_movement, last_movement):
        """Append the raw date/location/status of both endpoints to ``movements``."""
        movements.append({
            "container"      : container,
            "first_date"     : first_movement["date"],
            "first_location" : first_movement["location"],
            "first_status"   : first_movement["status"],
            "last_date"      : last_movement["date"],
            "last_location"  : last_movement["location"],
            "last_status"    : last_movement["status"]
        })

    def preprocess_movements(self, movements, container, carrier, first_movement, last_movement):
        """Append a fully encoded record for ``container`` to ``movements``.

        Besides the raw fields, the record carries the numeric carrier code,
        the elapsed time in days, and status/vehicle codes for both endpoints.
        Missing ``latitude``/``longitude`` are stored as ``None``.
        """
        movements.append({
            # General information
            "container"       : container,
            "carrier"         : self.CARRIERS.get(carrier, 0),
            "elapsed_days"    : self.get_elapsed_days(first_movement, last_movement),
            # First container information
            "first_date"        : first_movement["date"],
            "first_status"      : first_movement["status"],
            "first_status_code" : self.get_status_code(first_movement, carrier),
            "first_location"    : first_movement["location"],
            "first_latitude"    : first_movement.get("latitude", None),
            "first_longitude"   : first_movement.get("longitude", None),
            "first_vehicle"     : self.get_vehicle_code(first_movement),
            # Last container information
            "last_date"         : last_movement["date"],
            "last_status"       : last_movement["status"],
            "last_status_code"  : self.get_status_code(last_movement, carrier),
            "last_location"     : last_movement["location"],
            "last_latitude"     : last_movement.get("latitude", None),
            "last_longitude"    : last_movement.get("longitude", None),
            "last_vehicle"      : self.get_vehicle_code(last_movement)
        })

    def get_elapsed_days(self, first_movement, last_movement):
        """Return the time between both movements as a fractional number of days.

        ``total_seconds()`` is used so that the sub-second part of the
        difference is included as well.
        """
        elapsed = last_movement["date"] - first_movement["date"]
        return elapsed.total_seconds() / (3600 * 24)

    def get_status_code(self, movement, carrier):
        """Return the integer status code of ``movement``.

        A ``status_code`` stored on the movement wins. Otherwise the statuses
        collection is searched for a document whose field named after
        ``carrier`` equals the movement's status text; its ``code`` is
        returned, or 0 when no such document exists.
        """
        if "status_code" in movement:
            return int(movement["status_code"])
        # Lookup in database
        query = {
            carrier: movement["status"]
        }
        result = self.scraper_statuses.find_one(query)
        return int(result["code"]) if result else 0

    def get_vehicle_code(self, movement):
        """Return the vehicle code of ``movement``, or 0 when it is unknown.

        Without a ``vehicle`` key the result is always 0. Otherwise a stored
        ``vehicle_code`` is returned as is, falling back to the ``VEHICLES``
        table (0 for names it does not contain).
        """
        # NOTE(review): a "vehicle_code" stored without a "vehicle" key is
        # ignored by this guard - confirm that this is intended.
        if "vehicle" not in movement:
            return 0
        if "vehicle_code" in movement:
            return movement["vehicle_code"]
        # Lookup in enumeration
        return self.VEHICLES.get(movement["vehicle"], 0)

In [3]:
# Single preprocessor instance shared by all carrier evaluations below.
preprocessor = TrackingPreprocessor()

## Maersk

In [4]:
# Result tuple order: preprocessed, estimated, one_movement, no_movements, incoherent.
maersk = preprocessor.evaluate_carrier(carrier="Maersk")

In [5]:
# Slot 0 holds the fully preprocessed Maersk records; export them as CSV.
maersk_train = pd.DataFrame(data=maersk[0])
maersk_train.to_csv(path_or_buf="preprocess/maersk-train-20190419.csv")

In [6]:
# Slot 1 holds the Maersk containers whose last movement is estimated.
maersk_estimated = pd.DataFrame(data=maersk[1])
maersk_estimated.to_csv(path_or_buf="preprocess/maersk-estimated-20190419.csv")

In [7]:
# Slot 3 holds the Maersk containers without movements; write one per line.
with open("preprocess/maersk-empty-20190419.txt", "w") as file:
    file.writelines(container + "\n" for container in maersk[3])

## Evergreen

In [8]:
# Result tuple order: preprocessed, estimated, one_movement, no_movements, incoherent.
evergreen = preprocessor.evaluate_carrier(carrier="Evergreen")

In [9]:
# Slot 0 holds the fully preprocessed Evergreen records; export them as CSV.
evergreen_train = pd.DataFrame(data=evergreen[0])
evergreen_train.to_csv(path_or_buf="preprocess/evergreen-multiple-20190419.csv")

In [10]:
# Slot 2 holds the Evergreen containers with exactly one movement.
evergreen_single = pd.DataFrame(data=evergreen[2])
evergreen_single.to_csv(path_or_buf="preprocess/evergreen-single-20190419.csv")

## Hapag-Lloyd

In [11]:
# Result tuple order: preprocessed, estimated, one_movement, no_movements, incoherent.
hapaglloyd = preprocessor.evaluate_carrier(carrier="Hapag-Lloyd")

In [12]:
# Slot 0 holds the fully preprocessed Hapag-Lloyd records; export them as CSV.
hapaglloyd_train = pd.DataFrame(data=hapaglloyd[0])
hapaglloyd_train.to_csv(path_or_buf="preprocess/hapaglloyd-train-20190419.csv")

In [13]:
# Slot 1 holds the Hapag-Lloyd containers whose last movement is estimated.
hapaglloyd_estimated = pd.DataFrame(data=hapaglloyd[1])
hapaglloyd_estimated.to_csv(path_or_buf="preprocess/hapaglloyd-estimated-20190419.csv")

In [14]:
# Slot 3 holds the Hapag-Lloyd containers without movements; write one per line.
with open("preprocess/hapaglloyd-empty-20190419.txt", "w") as file:
    file.writelines(container + "\n" for container in hapaglloyd[3])