This notebook is used for downloading, extracting, and subsampling the Automatic Identification System (AIS) data, or Vessel traffic data, which are collected by the U.S. Coast Guard through an onboard navigation https://marinecadastre.gov/AIS/. 

The year data is divided into UTMZoneMap https://marinecadastre.gov/AIS/AIS%20Documents/UTMZoneMap2014.png.
The data is provided for years 2009-2020 divided into 19 UTM Zones.

For more information about specifc parameters:
https://help.marinetraffic.com/hc/en-us/articles/205426887-What-kind-of-information-is-AIS-transmitted-

VesselType: Passenger, Cargo, Tanker, etc.
https://help.marinetraffic.com/hc/en-us/articles/205579997-What-is-the-significance-of-the-AIS-Shiptype-number-

Status Values: sailing, moored etc..
https://help.marinetraffic.com/hc/en-us/articles/203990998-What-is-the-significance-of-the-AIS-Navigational-Status-Values

In [88]:
import wget
import os
import pandas as pd 
from os import walk
from glob import glob
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import zipfile
from datetime import datetime
from pathlib import Path

In [None]:
def check_dir(year):
    history =[]
    for path, dirs, files in os.walk(str(year)):
        for file in files:
            if file.endswith('.csv'):
                history.append(file.split('.')[0])
    return history

In [None]:
def download_AIS(year, zones):
    Path(str(year)).mkdir(parents=True, exist_ok=True)
    resume_download = check_dir(str(year))
    url = "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{0}/".format(year)
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')
    files = []
    for a in soup.find_all('a', href=True):
         if a.text and a.text.endswith('zip'):
            name, _ = a['href'].split('.')
            l = name.split('_')
            l.append(a.text)
            files.append(l) 

    df = pd.DataFrame(files)
    df.columns = [*df.columns[:-1], 'Files']
  
    for c in df.columns:
        if c == 'Files': continue
        unique_col = len(df[c].unique())
        if unique_col == 12 or unique_col == 6: # some years provid data only for 6 month
            df['Month'] = df[c]
        elif unique_col == 31:
            df['Days'] = df[c]
        elif 20 >= unique_col >= 18 or 'zone' in df[c][0].lower():
            if 'zone' in df[c][0].lower():
                df['Zone'] = pd.to_numeric([z[z.lower().find('zone')+4:]for z in df[c]])
            else:
                df['Zone'] = pd.to_numeric(df[c])
        del df[c]
    
    #  download
    if 'Zone' in df.columns:
        dl = df[df['Zone'].isin(zones)][['Files','Zone']]
    else:
        dl = df[['Files','Month']]
    for file, zone in tqdm(dl.values.tolist()):
        output = '%s_%s' % (year, str(zone))
        Path(str(year)).joinpath(output).mkdir(parents=True, exist_ok=True)
        if file.split('.')[0] in resume_download: continue
        print(file)
        wget.download(os.path.join(url,file)) 
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(str(year), output))
        os.remove(file)   

In [101]:
def show_plotly(dataframe):
    import plotly.express as px
    fig = px.scatter_mapbox(dataframe,
                        lat=dataframe.LAT,
                        lon=dataframe.LON, color='SOG', mapbox_style="stamen-toner")
    fig.update_geos(
        lataxis_range=[dataframe.LAT.min(),dataframe.LAT.max()], lonaxis_range=[dataframe.LON.min(), dataframe.LON.max()]
    )
    fig.show()

In [104]:
def subset_AIS_to_CSV(year, min_time_interval = 60):
    data_list = []
    for path, dirs, files in os.walk(str(year)):
        for file in files:
                if file.endswith('.csv'):
                    x = os.path.join(path, file)
                    print(x)
                    df = pd.read_csv(x)
                    df = df.drop(['MMSI', 'VesselName', 'CallSign', 'Cargo', 'TranscieverClass'], axis=1, errors='ignore')
                    df = df.dropna()
                    df = df.query('(Status == "under way using engine" or Status == "under way sailing" or  Status == 8 or  Status == 0) & 79 >= VesselType >= 70 & SOG > 3 & Length > 3 & Width > 3 & Draft > 3 ')
                    
                    # parse utc time
                    df.index = pd.to_datetime(df.BaseDateTime) # 2017-01-01T01:30:10
                    df = df.resample("%dmin" % min_time_interval).first()
                    
                    data_list.extend(df.values)

    df = pd.DataFrame(data_list, columns=['BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading',  'IMO', 'VesselType', 'Status', 'Length','Width', 'Draft'])
    df = df.dropna()
    df.to_csv(os.path.join(str(year), '%s_%.2dM.csv' % (str(year), min_time_interval)))

In [None]:
download_AIS(2020, zones=list(range(4,10))) # zone 4 -> 9 

In [106]:
subset_AIS_to_CSV(year='2016', min_time_interval = 60) # subsampling and convert downloaded data into one csv   

2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_01_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_02_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_03_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_04_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_05_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_06_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_07_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_08_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_09_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_10_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_11_Zone04.csv
2016\2016_Z04\AIS_ASCII_by_UTM_Month\2016\AIS_2016_12_Zone04.csv
2016\2016_Z05\AIS_ASCII_by_UTM_Month\2016\AIS_2016_01_Zone05.csv
2016\2016_Z05\AIS_ASCII_by_UTM_Month\2016\AIS_2016_02_Zone05.csv
2016\2016_Z05\AIS_ASCII_by_UTM_Month\2016\AIS_2016_03_Zone05.csv
2016\2016_Z05\AIS_ASCII_b

In [107]:
df = pd.read_csv('2016/2016_20M.csv')
show_plotly(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11857 entries, 0 to 11856
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    11857 non-null  int64  
 1   BaseDateTime  11857 non-null  object 
 2   LAT           11857 non-null  float64
 3   LON           11857 non-null  float64
 4   SOG           11857 non-null  float64
 5   COG           11857 non-null  float64
 6   Heading       11857 non-null  float64
 7   IMO           11857 non-null  object 
 8   VesselType    11857 non-null  float64
 9   Status        11857 non-null  object 
 10  Length        11857 non-null  float64
 11  Width         11857 non-null  float64
 12  Draft         11857 non-null  float64
dtypes: float64(9), int64(1), object(3)
memory usage: 1.2+ MB


In [86]:
df

Unnamed: 0.1,Unnamed: 0,BaseDateTime,LAT,LON,SOG,COG,Heading,IMO,VesselType,Status,Length,Width,Draft
0,0,2016-01-01T14:59:41,53.98027,-161.95992,6.9,108.9,92.0,IMO9507960,70.0,under way using engine,225.0,32.0,9.3
1,1,2016-01-01T15:05:41,53.97840,-161.94218,6.3,88.0,89.0,IMO9507960,70.0,under way using engine,225.0,32.0,9.3
2,2,2016-01-01T16:14:02,53.95210,-161.74760,5.4,102.0,85.0,IMO9507960,70.0,under way using engine,225.0,32.0,9.3
3,21,2016-01-02T11:09:52,54.04198,-161.56713,19.7,-135.8,277.0,IMO9215646,79.0,under way using engine,278.0,40.0,10.2
4,56,2016-01-03T22:22:59,53.96363,-161.84836,18.3,-114.6,299.0,IMO9347578,70.0,under way using engine,294.0,32.0,12.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4557,23038,2016-12-31T16:02:28,46.19074,-126.36199,13.2,88.3,89.0,IMO9643958,70.0,under way using engine,180.0,30.0,6.5
4558,23039,2016-12-31T17:03:59,46.16412,-126.08425,4.8,73.1,65.0,IMO9643958,70.0,under way using engine,180.0,30.0,6.5
4559,23043,2016-12-31T21:30:13,37.56835,-126.09695,15.4,-137.6,511.0,IMO9291133,70.0,under way using engine,199.0,32.0,8.1
4560,23044,2016-12-31T22:10:07,37.56517,-126.30757,15.0,-133.6,511.0,IMO9291133,70.0,under way using engine,199.0,32.0,8.1
