In [1]:
import numpy as np
import pandas as pd
import os 

In [2]:
# Directory holding the raw daily AIS archives (one zip file per day).
raw_dir = './data/raw_ais_2022/'

# os.listdir returns entries in arbitrary order, so sort the names to make the
# processing order (and the preview printed below) reproducible across runs.
# Match on the '.zip' extension rather than any name that merely ends in "zip".
raw_ais_file_list = sorted(
    name for name in os.listdir(raw_dir)
    if name.endswith('.zip')
)

print(raw_ais_file_list[:5])

['AIS_2019_01_01.zip', 'AIS_2019_01_02.zip', 'AIS_2019_01_03.zip', 'AIS_2019_01_04.zip', 'AIS_2019_01_05.zip']


In [6]:
from geopy import distance

def within_distance(x, threshold=130, anchor=(33.7484, -118.2226)):
    """Tell whether a position lies within ``threshold`` nautical miles of ``anchor``.

    Parameters
    ----------
    x : str
        Position formatted as comma-separated decimal degrees, ``"<lat>,<lon>"``.
    threshold : float, optional
        Maximum allowed distance from ``anchor`` in nautical miles (default 130).
    anchor : tuple of float, optional
        ``(lat, lon)`` reference point; defaults to the Port of Long Beach.

    Returns
    -------
    bool
        True when the position is at most ``threshold`` nautical miles from
        ``anchor``; False otherwise, including when a coordinate is NaN or
        infinite.

    Raises
    ------
    ValueError
        If a component of ``x`` cannot be parsed as a float.
    """
    from math import isfinite

    new_point = tuple(map(float, x.split(',')))

    # A missing coordinate reaches this function as the text "nan". Such a
    # position cannot be within range, so answer False instead of handing it to
    # geopy (NOTE(review): recent geopy versions are believed to raise
    # ValueError for non-finite points - confirm against the installed version).
    if not all(isfinite(coord) for coord in new_point):
        return False

    return distance.distance(anchor, new_point).nautical <= threshold

In [7]:
# source_files = ['./assets/AIS_2019_11_13.csv', './assets/AIS_2019_11_14.csv']
#source_files = ['./assets/test/AIS_2022_01_01.csv']


In [8]:
def process_chunk(chunk):
    """Filter one chunk of raw AIS rows and normalise its column names.

    Keeps only rows that
      * have a ``VesselType`` code in 70-89 (cargo and tanker vessels),
      * have a non-null ``IMO``,
      * lie within range of the anchor point according to ``within_distance``.

    A ``lat_lon`` column (``"<lat>,<lon>"``, both rounded to 4 decimals) is
    added and kept in the result, and the raw AIS column names are renamed to
    snake_case.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Raw AIS rows; must contain at least ``VesselType``, ``IMO``, ``LAT``
        and ``LON``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with renamed columns.
    """
    # we only need cargo and tanker vessels that carry an IMO number
    is_cargo_or_tanker = chunk['VesselType'].isin(range(70, 90))
    has_imo = chunk['IMO'].notna()

    # Take an explicit copy: the next statement adds a column, and writing to a
    # filtered slice of the caller's frame triggers pandas' SettingWithCopy
    # warning (and is not guaranteed to behave consistently).
    chunk = chunk[is_cargo_or_tanker & has_imo].copy()

    chunk['lat_lon'] = chunk['LAT'].round(4).astype(str) + ',' + chunk['LON'].round(4).astype(str)

    # Cast to bool so the mask is always a boolean indexer. When no rows
    # survive the filters above, .apply returns an empty object-dtype Series,
    # which pandas may otherwise interpret as a (empty) list of column labels.
    in_range = chunk['lat_lon'].apply(within_distance).astype(bool)
    chunk = chunk[in_range]

    return chunk.rename(columns={
        'VesselName': 'vessel_name',
        'MMSI': 'mmsi',
        'BaseDateTime': 'time_seen',
        'LAT': 'lat',
        'LON': 'lon',
        'SOG': 'sog',
        'COG': 'cog',
        'Heading': 'heading',
        'IMO': 'imo',
        'CallSign': 'call_sign',
        'VesselType': 'vessel_type',
        'Status': 'status',
        'Length': 'length',
        'Width': 'width',
        'Draft': 'draft',
        'Cargo': 'cargo',
        'TransceiverClass': 'transceiver_class'
    })




Since it takes a long time (an estimated 9 hours) to process all files, we first filter the raw AIS files and persist the results as CSV files.  
We then need to read all the CSV files, concatenate them into a single dataframe, and only then resample based on time. This ordering is essential: if we resampled each file separately, the time boundaries between files would be wrong.

In [9]:
# Number of CSV rows read (and filtered) at a time.
chunksize = 10 ** 6
filtered_dir = './data/filtered_ais_2022/'

# Create the output folder up front so that a missing directory cannot fail
# the run at the first to_csv call, after a whole file has been processed.
os.makedirs(filtered_dir, exist_ok=True)

for ais_file in raw_ais_file_list:

    print('Loading file....{}'.format(ais_file))

    # Use the chunked reader as a context manager so the underlying file
    # handle is closed once the file has been consumed (or if filtering fails).
    with pd.read_csv(os.path.join(raw_dir, ais_file), chunksize=chunksize,
                     compression='zip', on_bad_lines='warn') as df_chunk:
        # Filter chunk by chunk so only the retained rows are accumulated.
        chunks = [process_chunk(chunk) for chunk in df_chunk]

    if not chunks:
        # pd.concat raises ValueError on an empty list; there is nothing to
        # persist for a file that yielded no chunks.
        print('No rows read from {}, skipping'.format(ais_file))
        continue

    df_filtered = pd.concat(chunks).drop_duplicates()

    # splitext strips only the final extension, so a name containing extra
    # dots is not truncated at its first dot.
    out_name = os.path.splitext(ais_file)[0] + '_filtered.csv'
    df_filtered.to_csv(os.path.join(filtered_dir, out_name), index=False)

Loading file....AIS_2019_07_10.zip
Loading file....AIS_2019_07_11.zip
Loading file....AIS_2019_07_12.zip
Loading file....AIS_2019_07_13.zip
Loading file....AIS_2019_07_14.zip
Loading file....AIS_2019_07_15.zip
Loading file....AIS_2019_07_16.zip
Loading file....AIS_2019_07_17.zip
Loading file....AIS_2019_07_18.zip
Loading file....AIS_2019_07_19.zip
Loading file....AIS_2019_07_20.zip
Loading file....AIS_2019_07_21.zip
Loading file....AIS_2019_07_22.zip
Loading file....AIS_2019_07_23.zip
Loading file....AIS_2019_07_24.zip
Loading file....AIS_2019_07_25.zip
Loading file....AIS_2019_07_26.zip
Loading file....AIS_2019_07_27.zip
Loading file....AIS_2019_07_28.zip
Loading file....AIS_2019_07_29.zip
Loading file....AIS_2019_07_30.zip
Loading file....AIS_2019_07_31.zip
Loading file....AIS_2019_08_01.zip
Loading file....AIS_2019_08_02.zip
Loading file....AIS_2019_08_03.zip
Loading file....AIS_2019_08_04.zip
Loading file....AIS_2019_08_05.zip
Loading file....AIS_2019_08_06.zip
Loading file....AIS_

In [10]:
#filtered_dir = './assets/filtered_ais/'
#df_filtered.to_csv(os.path.join(filtered_dir, ais_file.split('.')[0] + '.csv'), index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>