In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from datetime import datetime

In [2]:
df_all_vessels_berth_time = pd.read_csv('./data/ais_with_vessel_berth/all_vessels_dwell_time.csv')
df_all_vessels_berth_time['time_seen'] = pd.to_datetime(df_all_vessels_berth_time['time_seen'])
df_all_vessels_berth_time['berth_time'] = pd.to_datetime(df_all_vessels_berth_time['berth_time'])
df_all_vessels_berth_time.shape

(3381018, 25)

In [3]:
# creat the target vessels df. We need to loop through this df to create new features with reference to other vessels when this vessel enters the port area
df_target_vessels = df_all_vessels_berth_time[df_all_vessels_berth_time['berth_time'].notnull()]
df_target_vessels.shape

(1802, 25)

#### Feature 1: Determine the average dwell time for target terminal


In [4]:
# Determine the average dwell time for target terminal
df_target_vessels['avg_dwell_at_target_terminal'] = np.nan

day_delta = 14
for index, row in df_target_vessels.iterrows():
    # reduce 1 min to make sure the current vessel won't be included in the df_in_range 
    upper_bound = row.time_seen + pd.Timedelta(-1, 'min')
    lower_bound = upper_bound + pd.Timedelta(-day_delta, 'day')
    #print(f'Vessel Name: {row.vessel_name}, Time Seen: {row.time_seen}, Target Terminal: {row.target_terminal}, +Days: {upper_bound}, -Days: {lower_bound}')
    df_in_range = df_target_vessels[(df_target_vessels['berth_time'] >= lower_bound) & (df_target_vessels['berth_time'] <= upper_bound) & (df_target_vessels['target_terminal'] == row.target_terminal)]
    #df_in_range = df_target_vessels[(df_target_vessels['berth_time'] >= lower_bound) & (df_target_vessels['berth_time'] <= upper_bound)]
    if not df_in_range.empty:
        avg_dwell_at_target_terminal = round(df_in_range['dwell_in_hr'].mean(), 1)
        #print(avg_dwell_at_target_terminal)
        df_target_vessels.at[index, 'avg_dwell_at_target_terminal'] = avg_dwell_at_target_terminal
    #break 
    
    

In [5]:
#df_in_range['target_terminal']
df_target_vessels.head(3)

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,transceiver_class,date_seen,is_entering_port,at_terminal,terminal_name,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal
3235,565475000.0,2020-02-04 10:15:50,32.30734,-117.68464,13.7,340.6,342.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,A,2020-02-04,True,False,,False,PierT,2020-02-07 14:00:04,75.7,25.5
3457,565475000.0,2020-03-22 10:16:06,32.04719,-117.56927,12.1,332.3,333.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,A,2020-03-22,True,False,,False,PierT,2020-03-24 15:30:26,53.2,46.4
18129,366562000.0,2020-02-05 00:23:12,33.63327,-120.81115,18.0,90.2,87.0,MANOA,IMO7907984,KDBG,...,A,2020-02-05,True,False,,False,PierA,2020-02-05 12:01:50,11.6,14.0


In [6]:
df_target_vessels['avg_dwell_at_target_terminal'].isnull().sum()

23

#### Feature 2: Determine the number of vessels berthed at the target terminal

In [7]:
# Determine the number of vessels berthed at the target terminal

df_target_vessels['num_of_vessel_at_target_terminal'] = pd.Series(dtype='int')
hr_delta = 1

for index, row in df_target_vessels.iterrows():
    # reduce 1 min to make sure the current vessel won't be included in the df_in_range 
    upper_bound = row['time_seen'] + pd.Timedelta(-1, 'min')
    lower_bound = upper_bound + pd.Timedelta(-hr_delta, 'hour')
    #print(f'Vessel Name: {row.vessel_name}, Time Seen: {row.time_seen}, Target Terminal: {row.target_terminal}, +Hours: {upper_bound}, -Hours: {lower_bound}')

    df_in_range = df_all_vessels_berth_time[
        (df_all_vessels_berth_time['at_terminal'] == True) & (df_all_vessels_berth_time['terminal_name'] == row['target_terminal']) & 
        (df_all_vessels_berth_time['time_seen'] >= lower_bound) & (df_all_vessels_berth_time['time_seen'] < upper_bound) 
    ]
    
    #print(f'Vessel Name: {df_in_range.vessel_name}, Time Seen: {df_in_range.time_seen}, Target Termianl: {df_in_range.target_terminal}')
    #print(f'Vessel Name: {row.vessel_name}, Number of Vessels at target terminal: {len(df_in_range.imo.unique().tolist())}')
    num_of_vessel_at_target_terminal = len(df_in_range['imo'].unique().tolist())
    df_target_vessels.at[index, 'num_of_vessel_at_target_terminal'] = num_of_vessel_at_target_terminal
    
    #break

In [8]:
df_target_vessels.head(3)

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,date_seen,is_entering_port,at_terminal,terminal_name,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal,num_of_vessel_at_target_terminal
3235,565475000.0,2020-02-04 10:15:50,32.30734,-117.68464,13.7,340.6,342.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,2020-02-04,True,False,,False,PierT,2020-02-07 14:00:04,75.7,25.5,1.0
3457,565475000.0,2020-03-22 10:16:06,32.04719,-117.56927,12.1,332.3,333.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,2020-03-22,True,False,,False,PierT,2020-03-24 15:30:26,53.2,46.4,2.0
18129,366562000.0,2020-02-05 00:23:12,33.63327,-120.81115,18.0,90.2,87.0,MANOA,IMO7907984,KDBG,...,2020-02-05,True,False,,False,PierA,2020-02-05 12:01:50,11.6,14.0,0.0


In [9]:
df_target_vessels['num_of_vessel_at_target_terminal'].isnull().sum()

0

#### Feature 3: Determine the number of vessels in the port area

In [10]:
# Determine the number of vessels in the part area

df_target_vessels['num_of_vessel_in_port'] = pd.Series(dtype='int')
hr_delta = 1

for index, row in df_target_vessels.iterrows():
    # reduce 1 min to make sure the current vessel won't be included in the df_in_range 
    upper_bound = row['time_seen'] + pd.Timedelta(-1, 'min')
    lower_bound = upper_bound + pd.Timedelta(-hr_delta, 'hour')
    #print(f'Vessel Name: {row.vessel_name}, Time Seen: {row.time_seen}, Target Terminal: {row.target_terminal}, +Hours: {upper_bound}, -Hours: {lower_bound}')

    df_in_range = df_all_vessels_berth_time[
        (df_all_vessels_berth_time['time_seen'] >= lower_bound) & (df_all_vessels_berth_time['time_seen'] <= upper_bound) 
    ]
    
    #print(f'Vessel Name: {df_in_range.vessel_name}, Time Seen: {df_in_range.time_seen}, Target Termianl: {df_in_range.target_terminal}')
    #print(f'Vessel Name: {row.vessel_name}, Number of Vessels in Port: {len(df_in_range.imo.unique().tolist())}')
    num_of_vessel_in_port = len(df_in_range['imo'].unique().tolist())
    df_target_vessels.at[index, 'num_of_vessel_in_port'] = num_of_vessel_in_port
    
    #break

In [11]:
df_target_vessels.head(3)

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,is_entering_port,at_terminal,terminal_name,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal,num_of_vessel_at_target_terminal,num_of_vessel_in_port
3235,565475000.0,2020-02-04 10:15:50,32.30734,-117.68464,13.7,340.6,342.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,True,False,,False,PierT,2020-02-07 14:00:04,75.7,25.5,1.0,50.0
3457,565475000.0,2020-03-22 10:16:06,32.04719,-117.56927,12.1,332.3,333.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,True,False,,False,PierT,2020-03-24 15:30:26,53.2,46.4,2.0,53.0
18129,366562000.0,2020-02-05 00:23:12,33.63327,-120.81115,18.0,90.2,87.0,MANOA,IMO7907984,KDBG,...,True,False,,False,PierA,2020-02-05 12:01:50,11.6,14.0,0.0,57.0


In [12]:
df_target_vessels['num_of_vessel_in_port'].isnull().sum()

0

#### Feature 4: Day of Week

In [13]:
# The dayofweek of pandas series denotes Monday by 0 
df_target_vessels['weekday'] = df_target_vessels['time_seen'].dt.dayofweek

In [14]:
df_target_vessels.head(2)

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,at_terminal,terminal_name,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal,num_of_vessel_at_target_terminal,num_of_vessel_in_port,weekday
3235,565475000.0,2020-02-04 10:15:50,32.30734,-117.68464,13.7,340.6,342.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,False,,False,PierT,2020-02-07 14:00:04,75.7,25.5,1.0,50.0,1
3457,565475000.0,2020-03-22 10:16:06,32.04719,-117.56927,12.1,332.3,333.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,False,,False,PierT,2020-03-24 15:30:26,53.2,46.4,2.0,53.0,6


#### Feature 5: Time of Day (hour)

In [15]:
df_target_vessels['hour_of_day'] = df_target_vessels['time_seen'].dt.hour

In [16]:
df_target_vessels.head(2)

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,terminal_name,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal,num_of_vessel_at_target_terminal,num_of_vessel_in_port,weekday,hour_of_day
3235,565475000.0,2020-02-04 10:15:50,32.30734,-117.68464,13.7,340.6,342.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,,False,PierT,2020-02-07 14:00:04,75.7,25.5,1.0,50.0,1,10
3457,565475000.0,2020-03-22 10:16:06,32.04719,-117.56927,12.1,332.3,333.0,MAERSK NEWHAVEN,IMO9215880,9VPY4,...,,False,PierT,2020-03-24 15:30:26,53.2,46.4,2.0,53.0,6,10


#### Feature 5: Is Public Holiday 

In [17]:
df_holiday = pd.read_csv('./data/us_holiday/us_holiday.csv')
df_holiday = df_holiday[df_holiday['isPaidTimeOff'] == True]

In [18]:
holiday_list = df_holiday['date'].tolist()

In [19]:
def is_holiday(x, holidays):
    if x.strftime("%Y-%m-%d") in holidays:
        return True
    else:
        return False

In [20]:
df_target_vessels['is_holiday']= df_target_vessels['time_seen'].apply(lambda x: is_holiday(x, holiday_list))

In [21]:
df_target_vessels[df_target_vessels['is_holiday'] == True ][:2]

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,...,is_berthing,target_terminal,berth_time,dwell_in_hr,avg_dwell_at_target_terminal,num_of_vessel_at_target_terminal,num_of_vessel_in_port,weekday,hour_of_day,is_holiday
25606,367641230.0,2020-07-04 19:21:37,32.22483,-120.049,21.5,81.0,81.0,MARJORIE C,IMO9619684,WDH6745,...,False,PierA,2020-07-06 21:00:24,49.6,20.8,1.0,42.0,5,19,True
175221,310768000.0,2020-07-03 21:06:55,34.36858,-120.71844,9.7,109.5,106.0,DUESSELDORF EXPRESS,IMO9143556,ZCEV4,...,False,PierA,2020-07-04 14:00:08,16.9,25.6,1.0,36.0,4,21,True


#### Clean Up and Output

In [22]:
df_target_vessels.columns.values

array(['mmsi', 'time_seen', 'lat', 'lon', 'sog', 'cog', 'heading',
       'vessel_name', 'imo', 'call_sign', 'vessel_type', 'status',
       'length', 'width', 'draft', 'cargo', 'transceiver_class',
       'date_seen', 'is_entering_port', 'at_terminal', 'terminal_name',
       'is_berthing', 'target_terminal', 'berth_time', 'dwell_in_hr',
       'avg_dwell_at_target_terminal', 'num_of_vessel_at_target_terminal',
       'num_of_vessel_in_port', 'weekday', 'hour_of_day', 'is_holiday'],
      dtype=object)

In [23]:
required_col = [
    'imo', 'vessel_name', 'time_seen', #'length', 'width', 'draft',
    'target_terminal', 'dwell_in_hr',
    'avg_dwell_at_target_terminal',
    'num_of_vessel_at_target_terminal', 'num_of_vessel_in_port',
    'weekday', 'hour_of_day', 'is_holiday'
]

In [24]:
df_output = df_target_vessels[required_col]

In [25]:
df_output.isnull().sum()

imo                                  0
vessel_name                          0
time_seen                            0
target_terminal                      0
dwell_in_hr                          0
avg_dwell_at_target_terminal        23
num_of_vessel_at_target_terminal     0
num_of_vessel_in_port                0
weekday                              0
hour_of_day                          0
is_holiday                           0
dtype: int64

In [26]:
df_output.shape

(1802, 11)

In [27]:
# avg_dwell_at_target_terminal is a required field
# Decided to drop records if this value is null
df_output = df_output[df_output['avg_dwell_at_target_terminal'].notnull()]
df_output.shape

(1779, 11)

In [28]:
df_output.to_csv('./data/final_output/ais_final.csv', index=False)

Extra work for getting the vessel imo list 

In [29]:
df_imo_list = pd.DataFrame(df_output['imo'].unique(), columns=['imo'])

In [30]:
df_imo_list.to_csv('./data/final_output/imo_list.csv', index=False)

In [31]:
df_imo_list.shape

(384, 1)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>