In [1]:
from pymongo import MongoClient

In [2]:
database   = MongoClient()["scraper2"]
containers = database["containers"]
movements  = database["container_movements"]
locations  = database["locations"]

# Datos de la naviera

## Maersk

In [3]:
query = {
    "container": {
        "$regex": "(APMU|COZU|FAAU|FRLU|KNLU|LOTU|MAEU|MALU|MCAU|MCHU|MCRU|MHHU|MIEU|MMAU|MNBU|MRKU|MRSU|MSAU|MSFU|MSKU|MSWU|MVIU|MWCU|MWMU|OCLU|POCU|PONU|SCMU|TORU)"
    }
}
update = {
    "$set": {
        "carrier": "Maersk"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

8963 matched, 0 modified


## Hapag-Lloyd

In [4]:
query = {
    "container": {
        "$regex": "(AZLU|CASU|CMUU|CPSU|CSQU|CSVU|FANU|FSCU|HAMU|HLBU|HLCU|HLXU|ITAU|IVLU|LBIU|LNXU|LYKU|MOMU|QIBU|QNNU|TLEU|TMMU|UACU|UAEU|UASU)"
    }
}
update = {
    "$set": {
        "carrier": "Hapag-Lloyd"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

53604 matched, 0 modified


## Evergreen

In [5]:
query = {
    "container": {
        "$regex": "(EGHU|EGSU|EISU|EMCU|HMCU|IMTU|LTIU|UGMU)"
    }
}
update = {
    "$set": {
        "carrier": "Evergreen"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

2091 matched, 0 modified


## Textainer

In [6]:
query = {
    "container": {
        "$regex": "(AMFU|AMZU|AXIU|CEOU|CHIU|CLHU|GAEU|GATU|GAZU|HCIU|KWCU|LLTU|MAGU|MAXU|MGLU|MLCU|PRSU|TEMU|TENU|TEXU|TGBU|TGHU|TXGU|WCIU|XINU)"
    },
    "processed": True
}
select = {
    "_id": 0,
    "container": 1,
    "carrier": 1
}
cursor = containers.find(query, select)

In [7]:
matched_count  = 0
modified_count = 0
index = 1
for textainer in cursor:
    query2 = {
        "container": textainer["container"]
    }
    update = {
        "$set": {
            "carrier": textainer["carrier"]
        }
    }
    result = movements.update_many(query2, update)
    matched_count += result.matched_count
    modified_count += result.modified_count
    index += 1
print(index, "indexes,", matched_count, "matched,", modified_count, "modified")

1956 indexes, 9081 matched, 0 modified


## Ubicaciones erróneas

In [8]:
query = locations.find({
    "location": {
        "$regex": "^ "
    }
})

bad_locations = list(query)
for location in bad_locations:
    print(location["location"], location["latitude"], location["longitude"])

 Salarno, Italy 46.138917 10.5204375
 SIN 35.000074 104.999927
 Morocco 31.1728205 -7.3362482
 QL 14.9 43.016667
 Russia 64.6863136 97.7453061


In [9]:
for location in bad_locations:
    result = movements.update_many({
        "latitude"  : location["latitude"],
        "longitude" : location["longitude"]
    }, {
        "$unset": {
            "latitude"  : True,
            "longitude" : True
        }
    })
    print(result.matched_count, "matched,", result.modified_count, "modified")

0 matched, 0 modified
1 matched, 1 modified
4 matched, 4 modified
3 matched, 3 modified
2 matched, 2 modified


In [10]:
result = locations.delete_many({
    "location": {
        "$regex": "^ "
    }
})
print(result.deleted_count)

5


## Ubicaciones imprecisas

In [11]:
query_hapag = {
    "location" : "PORT KELANG"
}
query_maersk = {
    "location": {
        "$regex": "Port Klang"
    }
}
insert = {
    "latitude"  : 2.9995164,
    "longitude" : 101.39144825844
}
update = {
    "$set" : {
        "latitude"  : 2.9995164,
        "longitude" : 101.39144825844
    }
}

# Upsert locations
result = locations.update_many(query_maersk, update)
print("Maersk location:", result.matched_count, "matched,", result.modified_count, "modified")
result = locations.update_many(query_hapag, update, upsert = True)
print("Hapag-Lloyd location:", result.matched_count, "matched,", result.modified_count, "modified,",
      result.upserted_id, "upserted")

# Update movements
result = movements.update_many(query_maersk, update)
print("Maersk movements:", result.matched_count, "matched,", result.modified_count, "modified")
result = movements.update_many(query_hapag, update)
print("Hapag-Lloyd movements:", result.matched_count, "matched,", result.modified_count, "modified")

Maersk location: 1 matched, 1 modified
Hapag-Lloyd location: 0 matched, 0 modified, 5cc51e5eac8d26eb3e16ed94 upserted
Maersk movements: 182 matched, 182 modified
Hapag-Lloyd movements: 32 matched, 32 modified


In [12]:
query = {
    "location" : "SAN ANTONIO"
}
update = {
    "$set": {
        "latitude"  : -33.5808615,
        "longitude" : -71.6132377 
    }
}

# Update location
result = locations.update_many(query, update)
print("Location:", result.matched_count, "matched,", result.modified_count, "modified")
# Update movements
result = movements.update_many(query, update)
print("Container movements:", result.matched_count, "matched,", result.modified_count, "modified")

Location: 1 matched, 1 modified
Container movements: 4025 matched, 4025 modified


In [13]:
query = {
    "location" : {
        "$regex" : "Salerno, Salarno, Italy"
    }
}
update = {
    "$set" : {
        "latitude"  : 40.6803601,
        "longitude" : 14.7594542
    }
}

# Upsert location
result = locations.update_many(query, update, upsert = True)
print("Hapag-Lloyd location:", result.matched_count, "matched,", result.modified_count, "modified,",
      result.upserted_id, "upserted")

# Update movements
result = movements.update_many(query, update)
print("Container movements:", result.matched_count, "matched,", result.modified_count, "modified")

Hapag-Lloyd location: 0 matched, 0 modified, 5cc51e5eac8d26eb3e16ed9a upserted
Container movements: 14 matched, 14 modified


## Ubicaciones faltantes

In [14]:
new_locations = [
    {
        "location"  : "St Petersburg FCT, Russia",
        "latitude"  : 59.8737069,
        "longitude" : 30.2197252
    },
    {
        "location"  : "BRISBANE, QL",
        "latitude"  : -27.4689682,
        "longitude" : 153.0234991
    },
    {
        "location"  : "Port Tangier Mediterranee, Morocco",
        "latitude"  : 35.86076335,
        "longitude" : -5.53755836602157
    },
    {
        "location"  : "Nansha New Port, Guangdong, China",
        "latitude"  : 22.76905225,
        "longitude" : 113.60493282976
    },
    {
        "location"  : "PORT OF ITAGUAI",
        "latitude"  : -22.8629597,
        "longitude" : -43.775322
    },
    {
        "location"  : "Hazira, GUJARAT, India",
        "latitude"  : 21.0956055,
        "longitude" : 72.6466021
    },
    {
        "location"  : "Sonepat, HARYANA, India",
        "latitude"  : 29.05587195,
        "longitude" : 76.895369282819
    },
    {
        "location"  : "TEPEJI DEL RIO, HID",
        "latitude"  : 19.89368265,
        "longitude" : -99.3487181526625
    },
    {
        "location"  : "Auckland Metroport, New Zealand",
        "latitude"  : -36.9259849,
        "longitude" : 174.816534034485
    },
    {
        "location"  : "Ambarli Port Istanbul, Turkey",
        "latitude"  : 40.9760031,
        "longitude" : 28.7115604
    },
    {
        "location"  : "Yuzhny, Ukraine",
        "latitude"  : 46.6225879,
        "longitude" : 31.0995364
    },
    {
        "location"  : "PITEAA",
        "latitude"  : 65.3134764,
        "longitude" : 21.4899394
    },
    {
        "location"  : "GARSTON/MERSYD.",
        "latitude"  : 53.3522993,
        "longitude" : -2.8961104
    },
    {
        "location"  : "BOLZANETO-RIONE DE GENOVA",
        "latitude"  : 44.458731,
        "longitude" : 8.9014171
    }
]

for location in new_locations:
    count = locations.count_documents(location)
    if count == 0:
        result = locations.insert_one(location)
        print(location["location"], "inserted")

St Petersburg FCT, Russia inserted
BRISBANE, QL inserted
Port Tangier Mediterranee, Morocco inserted
Nansha New Port, Guangdong, China inserted
PORT OF ITAGUAI inserted
Hazira, GUJARAT, India inserted
Sonepat, HARYANA, India inserted
TEPEJI DEL RIO, HID inserted
Auckland Metroport, New Zealand inserted
Ambarli Port Istanbul, Turkey inserted
Yuzhny, Ukraine inserted
PITEAA inserted
GARSTON/MERSYD. inserted
BOLZANETO-RIONE DE GENOVA inserted


In [15]:
pending = []
cursor = movements.distinct("location", {
    "latitude"  : None,
    "longitude" : None
})
for raw_location in cursor:
    # Get location query
    location_query = {
        "location": raw_location.split("\n")[-1]
    }
    # Check if location exists in database
    location = locations.find_one(location_query)
    if location:
        # Already in database, replace movements
        print(location["location"], "already in database, replacing:")
        movement_query = {
            "location": {
                "$regex" : location["location"]
            }
        }
        movement_update = {
            "$set" : {
                "latitude"  : location["latitude"],
                "longitude" : location["longitude"]
            }
        }
        result = movements.update_many(movement_query, movement_update)
        print(result.matched_count, "matched,", result.modified_count, "modified")
    else:
        pending.append(location_query["location"])

PORTLAND, OR already in database, replacing:
4 matched, 1 modified
BRISBANE, QL already in database, replacing:
11 matched, 11 modified
PORT OF ITAGUAI already in database, replacing:
9 matched, 9 modified
Ambarli Port Istanbul, Turkey already in database, replacing:
42 matched, 42 modified
Onne, Nigeria already in database, replacing:
3 matched, 1 modified
St Petersburg FCT, Russia already in database, replacing:
75 matched, 75 modified
Port Tangier Mediterranee, Morocco already in database, replacing:
12 matched, 12 modified
Yuzhny, Ukraine already in database, replacing:
19 matched, 19 modified
Ambarli Port Istanbul, Turkey already in database, replacing:
42 matched, 0 modified
MAZATLAN, SIN already in database, replacing:
83 matched, 1 modified
PITEAA already in database, replacing:
20 matched, 20 modified
GARSTON/MERSYD. already in database, replacing:
8 matched, 8 modified
Auckland Metroport, New Zealand already in database, replacing:
16 matched, 16 modified
TEPEJI DEL RIO, HID 

In [16]:
pending

['WORLD WIDE DUMMY LOCATION EQUIPMENT']