In [1]:
from pymongo import MongoClient

In [9]:
database   = MongoClient()["scraper2"]
containers = database["containers"]
movements  = database["container_movements"]
locations  = database["locations"]

# Datos de la naviera

## Maersk

In [28]:
query = {
    "container": {
        "$regex": "(APMU|COZU|FAAU|FRLU|KNLU|LOTU|MAEU|MALU|MCAU|MCHU|MCRU|MHHU|MIEU|MMAU|MNBU|MRKU|MRSU|MSAU|MSFU|MSKU|MSWU|MVIU|MWCU|MWMU|OCLU|POCU|PONU|SCMU|TORU)"
    }
}
update = {
    "$set": {
        "carrier": "Maersk"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

3896 matched, 0 modified


## Hapag-Lloyd

In [29]:
query = {
    "container": {
        "$regex": "(AZLU|CASU|CMUU|CPSU|CSQU|CSVU|FANU|FSCU|HAMU|HLBU|HLCU|HLXU|ITAU|IVLU|LBIU|LNXU|LYKU|MOMU|QIBU|QNNU|TLEU|TMMU|UACU|UAEU|UASU)"
    }
}
update = {
    "$set": {
        "carrier": "Hapag-Lloyd"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

22600 matched, 0 modified


## Evergreen

In [30]:
query = {
    "container": {
        "$regex": "(EGHU|EGSU|EISU|EMCU|HMCU|IMTU|LTIU|UGMU)"
    }
}
update = {
    "$set": {
        "carrier": "Evergreen"
    }
}
result = movements.update_many(query, update)
print(result.matched_count, "matched,", result.modified_count, "modified")

1671 matched, 0 modified


## Textainer

In [31]:
query = {
    "container": {
        "$regex": "(AMFU|AMZU|AXIU|CEOU|CHIU|CLHU|GAEU|GATU|GAZU|HCIU|KWCU|LLTU|MAGU|MAXU|MGLU|MLCU|PRSU|TEMU|TENU|TEXU|TGBU|TGHU|TXGU|WCIU|XINU)"
    },
    "processed": True
}
select = {
    "_id": 0,
    "container": 1,
    "carrier": 1
}
cursor = containers.find(query, select)

In [32]:
matched_count  = 0
modified_count = 0
index = 1
for textainer in cursor:
    query2 = {
        "container": textainer["container"]
    }
    update = {
        "$set": {
            "carrier": textainer["carrier"]
        }
    }
    result = movements.update_many(query2, update)
    matched_count += result.matched_count
    modified_count += result.modified_count
    index += 1
print(index, "indexes,", matched_count, "matched,", modified_count, "modified")

1035 indexes, 4432 matched, 0 modified


## Ubicaciones erróneas

In [10]:
query = locations.find({
    "location": {
        "$regex": "^ "
    }
})

bad_locations = list(query)
for location in bad_locations:
    print(location)

{'_id': ObjectId('5cb8385dac8d26eb3e104f62'), 'location': ' Salarno, Italy', 'latitude': 46.138917, 'longitude': 10.5204375}
{'_id': ObjectId('5cb8681cac8d26eb3e106938'), 'location': ' SIN', 'latitude': 35.000074, 'longitude': 104.999927}
{'_id': ObjectId('5cb87aefac8d26eb3e106ff1'), 'location': ' Morocco', 'latitude': 31.1728205, 'longitude': -7.3362482}
{'_id': ObjectId('5cb8ed73ac8d26eb3e10a72d'), 'location': ' QL', 'latitude': 14.9, 'longitude': 43.016667}
{'_id': ObjectId('5cba016cac8d26eb3e116618'), 'location': ' Russia', 'latitude': 64.6863136, 'longitude': 97.7453061}


In [7]:
for location in bad_locations:
    result = movements.update_many({
        "latitude"  : location["latitude"],
        "longitude" : location["longitude"]
    }, {
        "$unset": {
            "latitude"  : True,
            "longitude" : True
        }
    })
    print(result.matched_count, "matched,", result.modified_count, "modified")

0 matched, 0 modified
1 matched, 1 modified
4 matched, 4 modified
3 matched, 3 modified
2 matched, 2 modified


In [11]:
result = locations.delete_many({
    "location": {
        "$regex": "^ "
    }
})
print(result.deleted_count)

5


## Ubicaciones imprecisas

In [20]:
query_hapag = {
    "location" : "PORT KELANG"
}
query_maersk = {
    "location": {
        "$regex": "Port Klang"
    }
}
insert = {
    "latitude"  : 2.9995164,
    "longitude" : 101.39144825844
}
update = {
    "$set" : {
        "latitude"  : 2.9995164,
        "longitude" : 101.39144825844
    }
}

# Upsert locations
result = locations.update_many(query_maersk, update)
print("Maersk location:", result.matched_count, "matched,", result.modified_count, "modified")
result = locations.update_many(query_hapag, update, upsert = True)
print("Hapag-Lloyd location:", result.matched_count, "matched,", result.modified_count, "modified,",
      result.upserted_id, "upserted")

# Update movements
result = movements.update_many(query_maersk, update)
print("Maersk movements:", result.matched_count, "matched,", result.modified_count, "modified")
result = movements.update_many(query_hapag, update)
print("Hapag-Lloyd movements:", result.matched_count, "matched,", result.modified_count, "modified")

Maersk location: 1 matched, 0 modified
Hapag-Lloyd location: 1 matched, 0 modified, None upserted
Maersk movements: 182 matched, 0 modified
Hapag-Lloyd movements: 28 matched, 0 modified


In [12]:
query = {
    "location" : "SAN ANTONIO"
}
update = {
    "$set": {
        "latitude"  : -33.5808615,
        "longitude" : -71.6132377 
    }
}

# Update location
result = locations.update_many(query, update)
print("Location:", result.matched_count, "matched,", result.modified_count, "modified")
# Update movements
result = movements.update_many(query, update)
print("Container movements:", result.matched_count, "matched,", result.modified_count, "modified")

Location: 1 matched 1 modified
Container movements: 3624 matched 3624 modified


In [21]:
query = {
    "location" : {
        "$regex" : "Salerno, Salarno, Italy"
    }
}
update = {
    "$set" : {
        "latitude"  : 40.6803601,
        "longitude" : 14.7594542
    }
}

# Upsert location
result = locations.update_many(query, update, upsert = True)
print("Hapag-Lloyd location:", result.matched_count, "matched,", result.modified_count, "modified,",
      result.upserted_id, "upserted")

# Update movements
result = movements.update_many(query, update)
print("Container movements:", result.matched_count, "matched,", result.modified_count, "modified")

Hapag-Lloyd location: 0 matched, 0 modified, 5cc0fc067ea5731a6955e109 upserted
Container movements: 14 matched, 14 modified


## Ubicaciones faltantes

In [34]:
pending = []
cursor = movements.distinct("location", {
    "latitude"  : None,
    "longitude" : None
})
for raw_location in cursor:
    # Get location query
    location_query = {
        "location": raw_location.split("\n")[-1]
    }
    # Check if location exists in database
    location = locations.find_one(location_query)
    if location:
        # Already in database, replace movements
        print(location["location"], "already in database, replacing:")
        movement_query = {
            "location": {
                "$regex" : location["location"]
            }
        }
        movement_update = {
            "$set" : {
                "latitude"  : location["latitude"],
                "longitude" : location["longitude"]
            }
        }
        result = movements.update_many(movement_query, movement_update)
        print(result.matched_count, "matched,", result.modified_count, "modified")
    else:
        pending.append(location_query["location"])

In [35]:
pending

['BRISBANE, QL',
 'PORT OF ITAGUAI',
 'Ambarli Port Istanbul, Turkey',
 'WORLD WIDE DUMMY LOCATION EQUIPMENT',
 'St Petersburg FCT, Russia',
 'Port Tangier Mediterranee, Morocco',
 'Yuzhny, Ukraine',
 'Ambarli Port Istanbul, Turkey',
 'PITEAA',
 'GARSTON/MERSYD.',
 'Auckland Metroport, New Zealand',
 'TEPEJI DEL RIO, HID',
 'Hazira, GUJARAT, India',
 'Sonepat, HARYANA, India',
 'Nansha New Port, Guangdong, China']