In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta, date

This notebook should be run with both 2021 and 2022 resampled AIS

In [3]:
resampled_dir = './data/resampled_ais/'
df_resampled_ais_list = []
for ais_file in os.listdir(resampled_dir):
    print(ais_file)
    if ais_file.endswith('csv'):
        df = pd.read_csv(os.path.join(resampled_dir, ais_file))
        df_resampled_ais_list.append(df)

df_resampled_ais = pd.concat(df_resampled_ais_list)
df_resampled_ais['time_seen'] = pd.to_datetime(df_resampled_ais['time_seen'])


obsolete
resampled_ais_2020.csv
resampled_ais_2021.csv
resampled_ais_2022.csv


In [4]:
len(df_resampled_ais_list)

3

In [5]:
df_resampled_ais.head()

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,vessel_type,status,length,width,draft,cargo,transceiver_class,lat_lon
0,565807000.0,2020-01-01 00:00:00,32.31175,-117.53433,14.7,162.1,163.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.3118,-117.5343"
1,565807000.0,2020-01-01 00:31:18,32.18897,-117.48776,14.7,160.0,158.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.189,-117.4878"
2,565807000.0,2020-01-01 01:00:00,32.08009,-117.43979,14.5,161.7,163.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.0801,-117.4398"
3,565807000.0,2020-01-01 01:30:36,31.96277,-117.39213,14.6,163.0,165.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"31.9628,-117.3921"
4,565807000.0,2020-01-01 02:00:00,31.84855,-117.35075,14.8,160.2,161.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"31.8486,-117.3508"


In [6]:
df_resampled_ais.shape

(3381018, 18)

In [7]:
vessels = df_resampled_ais['imo'].unique().tolist()
len(vessels)

3485

In [8]:
# min_date is the first date of the whole data frame under current scope 
# currently, it is 2022/1/1
def check_entering_port(row, min_date):
    if pd.isna(row['date_seen_before']):
        if row['date_seen'] > min_date:
            return True
    else:
        # the orginal setting is 1. But chances are AIS signal is missed or a vessel could go out of circle and come back
        # This is still considered as 1 entrance. So, we set the threshold to 14 days now. 
        #  [10/8/22] we will set the threshold back to 7 days. This shall reduce the chance for this kind of ‘outliers’ 
        #  with extreme long dwell time which are caused by wrong detection
        if (pd.to_datetime(row['date_seen']) - pd.to_datetime(row['date_seen_before'])) / np.timedelta64(1, 'D')> 7:
            return True
    return False

In [9]:
# This is the orignal implementation. However, it took 42m to run for 2022 6 months data
# The poor perforamce is due to the join (merge) in loop
# We'll use a new implementation for this 
##################################################
#
# all_vessels = []
#earliest_date = df_resampled_ais['time_seen'].dt.date.min()
#for vsl in vessels:
#    df_vsl = df_resampled_ais[df_resampled_ais['imo'] == vsl]
#    df_vsl['date_seen'] = df_vsl['time_seen'].dt.date
#
#    df_vsl_grouped = df_vsl.groupby(by='date_seen', as_index=False).first()
#    df_vsl_grouped['date_seen_before'] = df_vsl_grouped['date_seen'].shift(1)
#    df_vsl_grouped['is_entering_port'] = df_vsl_grouped.apply(lambda x: check_entering_port(x, earliest_date), axis=1)
#   
#    all_vessels.append(df_vsl.merge(df_vsl_grouped[['imo', 'time_seen', 'is_entering_port']], how='left', on=['imo', 'time_seen']))


#df_all_vessels = pd.concat(all_vessels)
# Since we have joined to the original df_vsl, there are records with is_entering_port equals to NaN
# We will change NaN to False 
#df_all_vessels['is_entering_port'] =df_all_vessels['is_entering_port'].apply(lambda x: False if pd.isna(x) else x)

In [10]:
all_vessels = []
earliest_date = df_resampled_ais['time_seen'].dt.date.min()
for vsl in vessels:
    df_vsl = df_resampled_ais[df_resampled_ais['imo'] == vsl]
    df_vsl['date_seen'] = df_vsl['time_seen'].dt.date

    df_vsl_grouped = df_vsl.groupby(by='date_seen', as_index=False).first()
    df_vsl_grouped['date_seen_before'] = df_vsl_grouped['date_seen'].shift(1)
    df_vsl_grouped['is_entering_port'] = df_vsl_grouped.apply(lambda x: check_entering_port(x, earliest_date), axis=1)
   
    #all_vessels.append(df_vsl.merge(df_vsl_grouped[['imo', 'time_seen', 'is_entering_port']], how='left', on=['imo', 'time_seen']))
    all_vessels.append(df_vsl_grouped)

df_all_grouped = pd.concat(all_vessels)

df_all_vessels = pd.merge(df_resampled_ais, df_all_grouped[['imo', 'time_seen', 'date_seen', 'is_entering_port']], how='left', left_on=['imo', 'time_seen'], right_on=['imo', 'time_seen'])
# Since we have joined to the original df_vsl, there are records with is_entering_port equals to NaN
# We will change NaN to False 
df_all_vessels['is_entering_port'] = df_all_vessels['is_entering_port'].apply(lambda x: False if pd.isna(x) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
#df_all_vessels.to_csv('./assets/test/ais_entering_port.csv', index=False)
df_all_vessels.to_csv('./data/ais_with_vessel_berth/ais_entering_port.csv', index=False)

In [12]:
df_resampled_ais.shape

(3381018, 18)

In [13]:
df_all_vessels.shape

(3381018, 20)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>