This notebook downloads, extracts, and subsamples Automatic Identification System (AIS) data, also known as vessel traffic data, which is collected by the U.S. Coast Guard through onboard navigation safety devices. Source: https://marinecadastre.gov/AIS/

Each year's data is divided by UTM zone; see the UTM zone map: https://marinecadastre.gov/AIS/AIS%20Documents/UTMZoneMap2014.png.
The data is provided for years 2009-2020 divided into 19 UTM Zones.

For more information about specific parameters:
https://help.marinetraffic.com/hc/en-us/articles/205426887-What-kind-of-information-is-AIS-transmitted-

VesselType: Passenger, Cargo, Tanker, etc.
https://help.marinetraffic.com/hc/en-us/articles/205579997-What-is-the-significance-of-the-AIS-Shiptype-number-
https://coast.noaa.gov/data/marinecadastre/ais/VesselTypeCodes2018.pdf

Status values: sailing, moored, etc.
https://help.marinetraffic.com/hc/en-us/articles/203990998-What-is-the-significance-of-the-AIS-Navigational-Status-Values

In [None]:
import wget
import os
import pandas as pd 
from os import walk
from glob import glob
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import zipfile
from datetime import datetime, timedelta
from pathlib import Path

In [None]:
def check_dir(year):
    """Return the base names of every CSV file found under the ``year`` folder.

    The directory named ``str(year)`` is walked recursively. For each file
    whose name ends in ``.csv`` the text before the first dot is collected.
    A missing directory yields an empty list.
    """
    return [
        filename.split('.')[0]
        for _root, _subdirs, filenames in os.walk(str(year))
        for filename in filenames
        if filename.endswith('.csv')
    ]

In [None]:
def download_AIS(year, zones):
    """Download and extract the NOAA AIS zip archives published for ``year``.

    The directory listing for the year is scraped for ``.zip`` links. The
    underscore-separated parts of each archive name are inspected to work out
    which part holds the month, the day and the UTM zone. Archives are then
    downloaded into the current working directory, extracted into
    ``<year>/<year>_<zone-or-month>/`` and the zip is deleted.

    Archives whose base name matches a CSV already present under ``<year>/``
    are skipped, so an interrupted run can be resumed.

    Args:
        year: Year to download (int or str); used in the URL and folder names.
        zones: Zone numbers to keep. Only applied when the listing exposes a
            zone component; otherwise every archive is downloaded and grouped
            by month.

    Raises:
        requests.HTTPError: If the listing page cannot be fetched.
        ValueError: If the listing contains no zip archives.
    """
    year_dir = Path(str(year))
    year_dir.mkdir(parents=True, exist_ok=True)
    # A set gives constant-time "already extracted?" checks in the loop below.
    resume_download = set(check_dir(str(year)))
    url = "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{0}/".format(year)
    response = requests.get(url)
    # Fail loudly on 404/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    name_parts = []
    file_names = []
    for a in soup.find_all('a', href=True):
        if a.text and a.text.endswith('zip'):
            # Strip only the final extension; a plain split('.') breaks the
            # unpacking as soon as the href contains more than one dot.
            name = a['href'].rsplit('.', 1)[0]
            name_parts.append(name.split('_'))
            file_names.append(a.text)

    if not file_names:
        raise ValueError("No zip archives listed at %s" % url)

    # Keep the file name in its own column so rows with a different number of
    # name parts cannot shift it out of the 'Files' position.
    df = pd.DataFrame(name_parts)
    df['Files'] = file_names

    for c in df.columns:
        if c == 'Files': continue
        unique_col = len(df[c].unique())
        if unique_col == 12 or unique_col == 6: # some years provide data only for 6 months
            df['Month'] = df[c]
        elif unique_col == 31:
            df['Days'] = df[c]
        elif 20 >= unique_col >= 18 or 'zone' in df[c][0].lower():
            if 'zone' in df[c][0].lower():
                # Keep only the digits that follow the literal "zone" prefix.
                df['Zone'] = pd.to_numeric([z[z.lower().find('zone')+4:]for z in df[c]])
            else:
                df['Zone'] = pd.to_numeric(df[c])
        # The raw positional column is no longer needed once classified.
        del df[c]

    #  download
    if 'Zone' in df.columns:
        dl = df[df['Zone'].isin(zones)][['Files','Zone']]
    else:
        dl = df[['Files','Month']]
    for file, zone in tqdm(dl.values.tolist()):
        output = '%s_%s' % (year, str(zone))
        target_dir = year_dir.joinpath(output)
        target_dir.mkdir(parents=True, exist_ok=True)
        if file.split('.')[0] in resume_download: continue
        print(file)
        # Build the URL by concatenation (it already ends with '/'), and use
        # the path wget reports because it may rename the file on a clash.
        local_zip = wget.download(url + file)
        try:
            with zipfile.ZipFile(local_zip, 'r') as zip_ref:
                zip_ref.extractall(str(target_dir))
        finally:
            # Remove the archive even when extraction fails so a re-run
            # starts from a clean download.
            os.remove(local_zip)

In [None]:
def normalize_dates(date):
    """Snap a timestamp to a whole hour.

    Seconds are zeroed first. Minutes above 45 round up to the next hour;
    any other minute value rounds down to the start of the current hour.
    """
    truncated = date.replace(second=0)
    minute = truncated.minute
    # Positive offset moves forward to the next hour, negative back to :00.
    offset = (60 - minute) if minute > 45 else -minute
    return truncated + timedelta(minutes=offset)
   
    
    
def subset_AIS_to_CSV(year, min_time_interval = 60, start_time = datetime(2016, 3, 1, 3)):
    """Filter, resample and merge every extracted AIS CSV of ``year``.

    Each CSV found under ``<year>/`` is reduced to rows that pass the status,
    vessel-type, speed and size filter below. Timestamps are snapped to the
    hour with ``normalize_dates``, and the last row of every
    ``min_time_interval``-minute bucket is kept. The rows from all files are
    combined and written to ``<year>/<year>_AIS.csv``.

    Args:
        year: Year folder to process (int or str).
        min_time_interval: Resampling bucket width in minutes.
        start_time: Rows with a timestamp earlier than this are discarded.
            The default keeps the cutoff that used to be hard-coded.

    Returns:
        The combined ``pandas.DataFrame`` that was written to disk.
    """
    year_dir = str(year)
    output_path = os.path.join(year_dir, '%s_AIS.csv' % year_dir)
    # Both spellings are listed: only the misspelt one was dropped before, and
    # a surviving extra column would break the 12-column layout used below.
    unused_columns = ['MMSI', 'VesselName', 'CallSign', 'Cargo',
                      'TranscieverClass', 'TransceiverClass']
    data_list = []
    for path, dirs, files in os.walk(year_dir):
        for file in files:
            if not file.endswith('.csv'):
                continue
            x = os.path.join(path, file)
            # The merged output lives inside the walked folder; skip it so a
            # second run does not feed the previous result back in.
            if x == output_path:
                continue
            print(x)
            df = pd.read_csv(x)
            df = df.drop(unused_columns, axis=1, errors='ignore')
            df = df.dropna()
            # .copy() so the column assignment below works on an independent frame.
            df = df.query('(Status == "under way using engine" or Status == "under way sailing" or  Status == 8 or  Status == 0) & (VesselType == 1016 or 79 >= VesselType >= 70) & SOG > 3 & Length > 3 & Width > 3 & Draft > 3 ').copy()
            if df.empty:
                # Nothing survived the filter, so there is nothing to resample.
                continue

            # parse and normalize utc time
            df['BaseDateTime'] = pd.to_datetime(df.BaseDateTime, format='%Y-%m-%dT%H:%M:%S').apply(normalize_dates) # 2017-01-01T01:30:10
            df.index = df.BaseDateTime
            # "min" is the minute alias; the older "T" spelling is deprecated in recent pandas.
            df = df.resample("%dmin" % min_time_interval).last()
            data_list.extend(df.values)

    df = pd.DataFrame(data_list, columns=['BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading',  'IMO', 'VesselType', 'Status', 'Length','Width', 'Draft'])
    # Buckets with no observations come out of resample as all-NaN rows.
    df = df.dropna()
    df = df[df['BaseDateTime'] >= start_time]
    df.to_csv(output_path)
    return df

In [None]:
def show_plotly(dataframe):
    """Display the rows of ``dataframe`` as a mapbox scatter plot coloured by ``SOG``.

    The frame must provide ``LAT``, ``LON`` and ``SOG`` columns. The figure is
    shown immediately and nothing is returned.
    """
    # Imported here so plotly is only needed when a plot is requested.
    import plotly.express as px

    latitudes = dataframe.LAT
    longitudes = dataframe.LON
    figure = px.scatter_mapbox(
        dataframe,
        lat=latitudes,
        lon=longitudes,
        color='SOG',
        mapbox_style="stamen-toner",
    )
    # Pass the data's bounding box as the geo axis ranges.
    lat_bounds = [latitudes.min(), latitudes.max()]
    lon_bounds = [longitudes.min(), longitudes.max()]
    figure.update_geos(lataxis_range=lat_bounds, lonaxis_range=lon_bounds)
    figure.show()

In [None]:
# Download and extract the 2017 archives; range(4, 10) excludes its end, so zones 4-9 are requested.
download_AIS(2017, zones=list(range(4,10))) # zone 4 -> 9 

In [None]:
# Filter and resample (60-minute buckets) the CSVs under the "2017" folder and write them to a single CSV there.
subset_AIS_to_CSV(year='2017', min_time_interval = 60) # subsampling and convert downloaded data into one csv   

In [None]:
# Reload the merged CSV produced by subset_AIS_to_CSV and plot every point.
year = 2017
df = pd.read_csv(Path(str(year),'%s_AIS.csv' % year))
show_plotly(df)


In [None]:
# Plot only the rows with longitude below -110 and latitude below 70.
show_plotly(df[(df['LON'] < -110) & (df['LAT'] < 70)]) # North Pacific