In [1]:
import numpy as np
import warnings
from easydict import EasyDict as edict
import numpy as np
import pandas as pd
from utils.data_preprocess_utils import get_all_files, get_config
from easydict import EasyDict as edict
from tqdm import tqdm
import dill
# Read the preprocessing settings and wrap them in an EasyDict so later cells can
# access entries as attributes (e.g. cfg.in_dir, cfg.mmsi_info_dir).
# NOTE(review): get_config is a project helper; presumably it returns the parsed
# YAML as a dict - confirm. The path is relative to the notebook's working directory.
cfg = edict(get_config('./cfg/data_preprocess_cfg.yaml'))

In [2]:
from os import path
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from tqdm import tqdm
import dill

def process_file(file):
    """Collect the distinct MMSI values found in one CSV file.

    Parameters:
        file: path string of the file to inspect; only names ending in
            '.csv' are opened.

    Returns:
        The unique values of the 'MMSI' column as returned by pandas
        ``Series.unique`` (order of first appearance), or an empty list
        when the path does not end in '.csv'.
    """
    # Guard clause: non-CSV paths contribute no identifiers.
    if not file.endswith('.csv'):
        return []
    # Restrict the read to the single column that is needed.
    mmsi_frame = pd.read_csv(file, usecols=['MMSI'])
    return mmsi_frame['MMSI'].unique()

# Gather every input file under the configured input directory.
# NOTE(review): get_all_files is a project helper; presumably it returns a list of
# path strings (process_file calls .endswith on each item) - confirm.
files = get_all_files(cfg.in_dir)
# Resume from the previously saved MMSI list when it exists. Despite its name,
# cfg.mmsi_info_dir is used as a file path here, not as a directory.
if path.exists(cfg.mmsi_info_dir):
    with open(cfg.mmsi_info_dir, 'rb') as f:
        # NOTE(review): dill, like pickle, can execute arbitrary code while loading;
        # only load files this notebook produced itself.
        MMSIs = dill.load(f)
else:
    MMSIs = []

# Read the files on up to 8 threads; executor.map yields the results in input
# order, and wrapping it in list() forces every file to be processed here.
with ThreadPoolExecutor(max_workers=8) as executor:
    results = list(tqdm(executor.map(process_file, files), total=len(files)))

# Append the per-file unique values to the (possibly pre-loaded) list.
for result in results:
    MMSIs.extend(result)

# De-duplicate across files and previous runs; the resulting order is arbitrary.
MMSIs = list(set(MMSIs))

# Persist the merged list so the next run (and the vessel lookup cell) can reuse it.
with open(cfg.mmsi_info_dir, 'wb') as f:
    dill.dump(MMSIs, f)


100%|██████████| 1/1 [00:00<00:00,  8.29it/s]


In [3]:
from bs4 import BeautifulSoup
import json
import requests
def get_vessel_info(mmsi: str, timeout: float = 10.0):
    '''
    Look up the type and dimensions of a vessel from two public web services.

    The vessel type and subtype are scraped from the MarineTraffic details
    page; the length and width are read from the ShipFinder ``GetShip``
    endpoint. The lookup is best-effort: a field that cannot be obtained
    (request failure, timeout, unexpected page or payload layout) keeps its
    default value instead of raising.

    Parameters:
        mmsi: MMSI of the vessel
        timeout: seconds to wait for each HTTP request before giving up

    Returns:
        vessel_type: primary type, str; one of the entries of ``type_cal`` or 'Other'
        vessel_subtype: secondary type, str ('' when unknown)
        vessel_length: length (dm), float, Note: 10dm = 1m ('' when unknown)
        vessel_width: vessel width (dm), float ('' when unknown)

    Typical usage example:
        mmsi = '538003672'

        vessel_type, vessel_subtype, vessel_length, vessel_width = get_vessel_info(mmsi)
    '''
    type_cal = ['Sailing','Pleasure' ,'Cargo','Fishing','Passenger','Tanker','Tug','SAR','HSC','Dredging','Military']

    # Defaults returned for every field that the lookups below cannot fill in.
    vessel_type = 'Other'
    vessel_subtype = ''
    vessel_length = ''
    vessel_width = ''
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Referer": "https://www.marinetraffic.com/",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # --- Step 1: vessel type / subtype from the MarineTraffic details page ---
    url = f'https://www.marinetraffic.com/en/ais/details/ships/mmsi:{mmsi}'
    response = _request_or_none(requests.get, url=url, headers=headers, timeout=timeout)
    if response is None:
        return vessel_type, vessel_subtype, vessel_length, vessel_width
    if response.status_code == 400:
        print('Invalid MMSI')
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Candidate tags: <meta> elements whose content starts with 'vessel '.
        content = soup.find_all('meta', {'content': lambda x: x and x.startswith('vessel ')})
        for tag in content:
            if tag.get('data-react-helmet') == 'true' and tag.get('name') == 'keywords':
                parsed = _parse_vessel_keywords(tag.get('content'), type_cal)
                if parsed is not None:
                    vessel_type, vessel_subtype = parsed
                # Only the first matching keywords tag is considered.
                break

    # --- Step 2: length / width from the ShipFinder GetShip endpoint ---
    data = {'mmsi': mmsi}
    response = _request_or_none(
        requests.post, "http://www.shipfinder.com/ship/GetShip", data=data, timeout=timeout
    )
    if response is None:
        return vessel_type, vessel_subtype, vessel_length, vessel_width
    if response.status_code == 200:
        dimensions = _parse_shipfinder_dimensions(response.text)
        if dimensions is not None:
            vessel_length, vessel_width = dimensions
    return vessel_type, vessel_subtype, vessel_length, vessel_width


def _request_or_none(send, *args, **kwargs):
    """Issue one HTTP request via ``send``; return None if it fails to connect or times out."""
    try:
        return send(*args, **kwargs)
    except requests.exceptions.ConnectionError:
        print('ConnectionError')
    except requests.exceptions.Timeout:
        # A read timeout is not a ConnectionError, so it needs its own clause.
        print('Timeout')
    return None


def _parse_vessel_keywords(keywords, known_types):
    """Extract (vessel_type, vessel_subtype) from a 'keywords' meta content string.

    Returns None when the string has fewer than four comma-separated fields.
    """
    fields = keywords.split(',')
    if len(fields) < 4:
        return None
    # Fourth field: drop its first character, then keep the first word.
    vessel_type = fields[3][1:].split(' ')[0]
    # Third field: drop its first 13 characters.
    # NOTE(review): presumably a fixed label prefix on the page - confirm against a live page.
    vessel_subtype = fields[2][13:]
    if vessel_type not in known_types:
        vessel_type = 'Other'
    return vessel_type, vessel_subtype


def _parse_shipfinder_dimensions(payload_text):
    """Return (length, width) as floats from a GetShip response body, or None if unavailable."""
    try:
        record = json.loads(payload_text)['data'][0]
        return float(record['length']), float(record['width'])
    except (ValueError, KeyError, IndexError, TypeError):
        # Covers non-JSON bodies, a missing/empty 'data' list, missing keys and
        # non-numeric values; the caller then keeps its defaults.
        return None
    

In [4]:
import os
import re
import json
import concurrent.futures
from tqdm import tqdm


def process_mmsi(mmsi):
    """Fetch and cache the vessel information for one MMSI.

    Relies on three module-level names: ``pattern`` (compiled regex the
    identifier must match), ``ships_info_dict`` (the shared cache, keyed by
    the MMSI string) and ``get_vessel_info``.

    Parameters:
        mmsi: the identifier; it is converted with ``str`` before validation.

    Returns:
        The MMSI as a string when it is exactly 9 characters long and matches
        ``pattern`` (whether it was fetched now or already cached); otherwise
        None, and the cache is left untouched.
    """
    mmsi_str = str(mmsi)
    if len(mmsi_str) != 9 or not pattern.match(mmsi_str):
        return None
    if mmsi_str not in ships_info_dict:
        vessel_type, vessel_subtype, vessel_length, vessel_width = get_vessel_info(mmsi_str)
        # Store the finished record in a single assignment: if the lookup raises,
        # no empty placeholder is left behind in the cache, and other threads
        # never observe a half-filled entry.
        ships_info_dict[mmsi_str] = {
            'vessel_type': vessel_type,
            'vessel_subtype': vessel_subtype,
            'vessel_length': vessel_length,
            'vessel_width': vessel_width,
        }
    return mmsi_str

# Start from the previously saved lookup results, if any, so that MMSIs that
# were already resolved are not fetched again (process_mmsi skips cached keys).
if not os.path.exists(cfg.vessels_info_dic_dir):
    ships_info_dict = {}
else:
    with open(cfg.vessels_info_dic_dir) as json_file:
        ships_info_dict = json.load(json_file)

# process_mmsi only accepts identifiers whose first character is a digit from 2 to 7.
pattern = re.compile(r'^[2-7]')
with open(cfg.mmsi_info_dir, 'rb') as f:
    # NOTE(review): dill, like pickle, can execute arbitrary code while loading;
    # only load files this notebook produced itself.
    MMSIs = dill.load(f)

try:
    # Process the MMSIs concurrently on a thread pool; list() drains the lazy map
    # so every lookup has finished (or raised) before the pool is closed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_mmsi, MMSIs), total=len(MMSIs)))
finally:
    # Save whatever has been collected even if a lookup raised or the run was
    # interrupted, so completed lookups are not lost. Empty records (placeholders
    # left by a lookup that did not finish) are skipped so they get retried.
    completed_info = {
        mmsi_key: record for mmsi_key, record in ships_info_dict.items() if record
    }
    with open(cfg.vessels_info_dic_dir, 'w') as fp:
        json.dump(completed_info, fp)


100%|██████████| 111064/111064 [00:43<00:00, 2542.75it/s]
