In [1]:
from pymongo import MongoClient

import pandas as pd
import numpy as np

In [2]:
# Open a client with the driver's default connection settings and pick the
# scraper database.
database = MongoClient()["scraper2"]

# One handle per collection, fetched explicitly by name.
containers_table = database.get_collection("containers")
movements_table = database.get_collection("container_movements")
prefixes_table = database.get_collection("container_prefixes")
statuses_table = database.get_collection("statuses")

In [3]:
# Field projection for movement queries: hide Mongo's `_id`, keep the rest.
# NOTE(review): `select` is assigned again in a later cell before the first
# visible query that uses it, so this version does not reach that query.
select = dict(
    _id=0,
    container=1,
    date=1,
    location=1,
    latitude=1,
    longitude=1,
    status=1,
    estimated=1,
    transport_type=1,
)

## Obtener contenedores y movimientos

In [6]:
# Distinct values of the "container" field among documents flagged as processed.
containers = containers_table.distinct(key="container", filter={"processed": True})
len(containers)

3346

In [7]:
# Field projection used when fetching movements: every listed field is
# included (value 1). `carrier` is required by `preprocess`; `_id` is not
# excluded here, so it is still returned by the query.
select = dict.fromkeys(
    (
        "container",
        "carrier",
        "date",
        "location",
        "latitude",
        "longitude",
        "status",
        "estimated",
        "transport_type",
    ),
    1,
)

## Preprocesar datos

In [8]:
# Numeric code for each known transport type; anything else maps to 0.
_VEHICLE_CODES = {"Vessel": 1, "Truck": 2}


def preprocess(movement, container):
    """Add numeric ``vehicle`` and ``status_code`` fields to a movement record.

    The record is modified in place and also returned.

    Args:
        movement: Movement document (dict). Reads ``transport_type``,
            ``carrier`` and ``status``; any of them may be absent or ``None``.
        container: Container the movement belongs to. Not used by the body;
            kept so existing call sites keep working.

    Returns:
        The same ``movement`` dict with two extra keys:
        ``vehicle`` (1 for "Vessel", 2 for "Truck", 0 for anything else) and
        ``status_code`` (``int`` of the matched status document's ``code``,
        or 0 when the lookup is skipped or finds nothing).
    """
    # Get vehicle type as number
    movement["vehicle"] = _VEHICLE_CODES.get(movement.get("transport_type"), 0)

    # Get status type as number. The lookup filters on a field named after the
    # carrier whose value is the raw status text, so it only makes sense when
    # both are present; otherwise fall back to the "unknown" code instead of
    # raising KeyError or querying with a None key/value.
    # NOTE(review): presumably each status document has one field per carrier
    # plus a "code" field -- confirm against the `statuses` collection.
    carrier = movement.get("carrier")
    status_text = movement.get("status")
    status = None
    if carrier is not None and status_text is not None:
        status = statuses_table.find_one({carrier: status_text})
    movement["status_code"] = int(status["code"]) if status else 0

    return movement

In [9]:
# Gather the preprocessed movements of every container, skipping containers
# that have at most one recorded movement.
movements = []
for container in containers:
    partial_movements = list(movements_table.find({"container": container}, select))
    if len(partial_movements) > 1:
        movements.extend(
            preprocess(movement, container) for movement in partial_movements
        )
len(movements)

8907

## Dataframe preprocesado

In [10]:
# Build the frame from the preprocessed records and discard the raw fields
# that were replaced by numeric codes (plus Mongo's `_id`, `location` and
# `carrier`). `columns=` is used because passing the axis positionally to
# `drop` is no longer accepted by pandas 2.x.
data = pd.DataFrame(movements).drop(
    columns=["_id", "status", "transport_type", "location", "carrier"]
)

In [11]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,container,date,estimated,latitude,longitude,status_code,vehicle
0,TEMU9040627,2019-04-12 21:56:00,False,-33.045846,-71.619675,1,2
1,TEMU9040627,2019-04-17 21:00:00,True,-33.045846,-71.619675,0,1
2,TEMU9040627,2019-05-10 01:00:00,True,22.279328,114.162813,0,1
3,TGHU0538924,2019-03-05 16:21:00,False,25.798845,-100.372833,1,2
4,TGHU0538924,2019-03-11 17:59:00,False,19.127657,-104.284126,2,2


In [12]:
# Preview the last five rows of the frame.
data.tail(n=5)

Unnamed: 0,container,date,estimated,latitude,longitude,status_code,vehicle
8902,FSCU5670046,2019-04-07 09:25:00,False,36.846292,-76.292925,7,1
8903,FSCU5670046,2019-03-12 07:33:00,False,51.22111,4.399708,1,2
8904,EGSU9089973,2019-03-07 00:00:00,False,-12.066667,-77.15,3,1
8905,EGSU9089973,2019-04-10 00:00:00,False,22.350627,114.184916,9,1
8906,EGSU9089973,2019-04-11 00:00:00,False,23.182451,113.476086,7,1
