In [1]:
from pymongo import MongoClient, ASCENDING, DESCENDING
from datetime import datetime

import pandas as pd
import numpy as np
import os
import csv

In [9]:
class TrackingPreprocessor:
    
    # TODO: No hardcodear esto
    CARRIERS = {
        "Maersk"      : 1,
        "Hapag-Lloyd" : 2,
        "Evergreen"   : 3
    }
    VEHICLES = {
        "Vessel" : 1,
        "Truck"  : 2,
        "Train"  : 3
    }
    
    def __init__(self):
        self.scraper_database   = MongoClient()["scraper2"]
        self.scraper_containers = self.scraper_database["containers"]
        self.scraper_movements  = self.scraper_database["container_movements"]
        self.scraper_statuses   = self.scraper_database["container_statuses"]
        self.preprocessing_date = datetime.now().strftime("%Y%m%d")
    
    def evaluate_carrier(self, carrier):
        finished, repeated, missing, estimated, single, empty, incoherent = [], [], [], [], [], [], []
        
        # Iterate through container query
        for container in self.query_containers(carrier):
            movements = list(self.query_movements(container))
            
            # Case no movements found
            if len(movements) == 0:
                empty.append(container)
                continue
            # Case only one movement found
            if len(movements) == 1:
                self.save_one_movement(single, container, movements[0])
                continue
            
            # Get first and last movements
            first = movements[0]
            last  = movements[-1]
            # This case should never happen
            if first["estimated"] == True:
                self.save_movements(incoherent, container, first, last)
                continue
            # Case last movement is estimated
            if last["estimated"] == True:
                self.save_movements(estimated, container, first, last)
                continue
            # Case both movements contain real data
            self.preprocess_movements(finished, repeated, missing, container, carrier, first, last)
        
        # Create directory
        directory = self.create_parent_directory()
        
        # Save containers ready for training
        if len(finished) > 0:
            print("-", len(finished), "containers with finished movements")
            self.save_to_csv(finished, directory, carrier, "train")
        # Save containers with same locations in both extremes
        if len(repeated) > 0:
            print("-", len(repeated), "containers with finished movements but same locations")
            self.save_to_csv(repeated, directory, carrier, "same-locations")
        # Save containers with missing locations
        if len(missing) > 0:
            print("-", len(missing), "containers with finished movements but missing locations")
            self.save_to_csv(missing, directory, carrier, "missing-locations")
        
        # Save containers with estimated movements
        if len(estimated) > 0:
            print("-", len(estimated), "containers with estimated movements")
            self.save_to_csv(estimated, directory, carrier, "estimated")
        # Save containers with only one movement
        if len(single) > 0:
            print("-", len(single), "containers with only one movement")
            self.save_to_csv(single, directory, carrier, "single")
        # Save empty containers
        if len(empty) > 0:
            print("-", len(empty), "containers with no movements")
            self.save_to_text(empty, directory, carrier, "empty")
        
        # Save incoherent containers
        if len(incoherent) > 0:
            print("-", len(incoherent), "incoherent containers found!")
            self.save_to_csv(incoherent, directory, carrier, "incoherent")
    
    def query_containers(self, carrier):
        query = {
            "carrier"   : carrier,
            "processed" : True
        }
        return self.scraper_containers.distinct("container", query)
    
    def query_movements(self, container):
        query = {
            "container" : container
        }
        sort = [
            ("date", ASCENDING),
            ("_id",  ASCENDING)
        ]
        return self.scraper_movements.find(query).sort(sort)
    
    def save_one_movement(self, movements, container, movement):
        # Write header
        if len(movements) == 0:
            movements.append(["container", "date", "status", "location"])
        # Write content
        movements.append([container, movement["date"], movement["status"], movement["location"]])
    
    def save_movements(self, movements, container, first_movement, last_movement):
        # Write header
        if len(movements) == 0:
            movements.append(["container",
                              "first_date",
                              "first_location",
                              "first_status",
                              "last_date",
                              "last_location",
                              "last_status"])
        # Write content
        movements.append([container,
                          first_movement["date"],
                          first_movement["location"],
                          first_movement["status"],
                          last_movement["date"],
                          last_movement["location"],
                          last_movement["status"]])
    
    def preprocess_movements(self, finished, repeated, missing, container, carrier, first, last):
        # Check if both movements are in the same location
        if first["location"] == last["location"]:
            self.save_movements(repeated, container, first, last)
            return
        
        # Check if both movements have geocodes
        if ("latitude" not in first) or ("longitude" not in first):
            self.save_one_movement(missing, container, first)
            return
        if ("latitude" not in last) or ("longitude" not in last):
            self.save_one_movement(missing, container, last)
            return
        
        # Write header
        if len(finished) == 0:
            finished.append(["container",
                             "carrier",
                             "timedelta",
                             "first_date",
                             "first_status",
                             "first_status_code",
                             "first_location",
                             "first_latitude",
                             "first_longitude",
                             "first_vehicle",
                             "last_date",
                             "last_status",
                             "last_status_code",
                             "last_location",
                             "last_latitude",
                             "last_longitude",
                             "last_vehicle"])
        # Write content
        finished.append([
            # General information
            container,
            self.CARRIERS.get(carrier, 0),
            self.get_elapsed_days(first, last),
            # First container information
            first["date"],
            first["status"],
            self.get_status_code(first, carrier),
            first["location"],
            first["latitude"],
            first["longitude"],
            self.get_vehicle_code(first),
            # Last container information
            last["date"],
            last["status"],
            self.get_status_code(last, carrier),
            last["location"],
            last["latitude"],
            last["longitude"],
            self.get_vehicle_code(last)
        ])
    
    def get_elapsed_days(self, first_movement, last_movement):
        timedelta = last_movement["date"] - first_movement["date"]
        return timedelta.days + timedelta.seconds/(3600*24)
    
    def get_status_code(self, movement, carrier):
        if "status_code" in movement:
            return int(movement["status_code"])
        
        # Lookup in database
        query = {
            carrier : movement["status"]
        }
        result = self.scraper_statuses.find_one(query)
        
        # Return code as integer
        return int(result["code"]) if result else 0
    
    def get_vehicle_code(self, movement):
        if "vehicle" not in movement:
            return 0
        if "vehicle_code" in movement:
            return movement["vehicle_code"]
        
        # Lookup in enumeration
        return self.VEHICLES.get(movement["vehicle"], 0)
    
    def create_parent_directory(self):
        directory = "preprocess-{}".format(datetime.now().strftime("%Y%m%d"))
        if not os.path.exists(directory):
            os.mkdir(directory)
        return directory
    
    def save_to_csv(self, movements, directory, carrier, category):
        # Get filename
        filename = "{}/{}-{}.csv".format(directory, carrier, category)
        # Write CSV
        with open(filename, "w", newline = "") as file:
            writer = csv.writer(file)
            writer.writerows(movements)
    
    def save_to_text(self, containers, directory, carrier, category):
        # Get filename
        filename = "{}/{}-{}.txt".format(directory, carrier, category)
        # Write text file
        with open(filename, "w") as file:
            for container in containers:
                file.write(container + "\n")

## Maersk

In [11]:
preprocessor = TrackingPreprocessor()

In [7]:
preprocessor.evaluate_carrier("Maersk")

- 608 containers with finished movements
- 30 containers with finished movements but missing locations
- 374 containers with estimated movements
- 5694 containers with no movements


## Evergreen

In [5]:
preprocessor.evaluate_carrier("Evergreen")

- 70 containers with finished movements
- 533 containers with finished movements but same locations
- 1493 containers with only one movement


## Hapag-Lloyd

In [12]:
preprocessor.evaluate_carrier("Hapag-Lloyd")

- 3018 containers with finished movements
- 5 containers with finished movements but same locations
- 13 containers with finished movements but missing locations
- 2246 containers with estimated movements
- 45 containers with no movements
- 25 incoherent containers found!
