Load all filtered AIS files, merge them into one, then resample

In [1]:
import numpy as np
import pandas as pd
import os 

In [2]:
# Collect the names of the filtered AIS CSV files to merge.
filtered_dir = './data/filtered_ais_2022/'
# os.listdir returns entries in arbitrary order, so sort the names to make the
# merge order reproducible (for names like AIS_YYYY_MM_DD_filtered.csv the
# lexicographic order is also chronological). Match on '.csv' so that only
# files with the CSV extension are picked up.
filtered_ais_file_list = sorted(
    file for file in os.listdir(filtered_dir) if file.endswith('.csv')
)

print(filtered_ais_file_list[:5])

['AIS_2020_01_01_filtered.csv', 'AIS_2020_01_02_filtered.csv', 'AIS_2020_01_03_filtered.csv', 'AIS_2020_01_04_filtered.csv', 'AIS_2020_01_05_filtered.csv']


In [3]:

# Read every filtered CSV listed above and stack the frames into a single one.
df_list = [
    pd.read_csv(os.path.join(filtered_dir, csv_name))
    for csv_name in filtered_ais_file_list
]

df_all_filtered_ais = pd.concat(df_list)

In [4]:
# (rows, columns) of the merged frame.
df_all_filtered_ais.shape

(11713700, 18)

In [5]:
# Resample with 30min interval per each vessel
resampled_dir = './data/resampled_ais/'
resample_freq = '30min'  # same 30-minute window as '30T'; the 'T' alias is deprecated in recent pandas


def _resample_vessel_track(df_vsl, freq):
    """Keep the first record of every `freq` window for a single vessel.

    Parameters
    ----------
    df_vsl : pandas.DataFrame
        Rows belonging to one vessel; must contain 'time_seen' and 'imo'.
    freq : str
        Resampling frequency passed to ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty window, with 'time_seen' formatted as
        '%Y-%m-%d %H:%M:%S' and a fresh 0..n-1 index.
    """
    observed_at = pd.to_datetime(df_vsl['time_seen'])
    # assign() returns a new frame, so the caller's data is never written to
    # (the original in-place assignment on a slice raised SettingWithCopyWarning).
    df_vsl = df_vsl.assign(time_seen=observed_at)
    # Resample on a separately named index and keep 'time_seen' as a regular
    # column. The original used resample(on='time_seen') followed by
    # reset_index(drop=True) and then read df_vsl['time_seen']; that only works
    # if the resampled frame still carries 'time_seen' as a column, which is
    # not guaranteed across pandas versions.
    df_vsl.index = pd.DatetimeIndex(observed_at, name='bin_start')
    df_vsl = df_vsl.resample(freq).first()
    # During resampling, as we are taking every 30 min interval, chances are we
    # may not have records in that period, which yields a full row of NaN.
    # IMO is our key and should never be null, so drop those rows right away.
    df_vsl = df_vsl[df_vsl['imo'].notnull()].reset_index(drop=True)
    df_vsl['time_seen'] = df_vsl['time_seen'].dt.strftime('%Y-%m-%d %H:%M:%S')
    return df_vsl


# groupby splits the frame in a single pass instead of re-scanning all rows
# once per vessel; sort=False keeps vessels in order of first appearance.
all_vessels = [
    _resample_vessel_track(df_vsl, resample_freq)
    for _, df_vsl in df_all_filtered_ais.groupby('imo', sort=False)
]

df_resampled_ais = pd.concat(all_vessels, ignore_index=True)


# Found there is a column name spelling error for some of the 2021 data files.
# In 2022 AIS data, there is a column named 'Transceiver Class'
# During the Filter AIS Data step, we have renamed it to 'transceiver_class'
# However, in the 2021 data set, the column was wrongly spelled as 'TranscieverClass'
# The Filter AIS Data step does not handle this, so it shows up as an extra column here
# (only some of the 2021 files have this problem).
# To fix it, copy the non-null 'TranscieverClass' values into 'transceiver_class',
# then drop 'TranscieverClass' by selecting the expected columns in order.
if 'TranscieverClass' in df_resampled_ais.columns:
    has_misspelled_value = df_resampled_ais['TranscieverClass'].notnull()
    df_resampled_ais.loc[has_misspelled_value, 'transceiver_class'] = (
        df_resampled_ais.loc[has_misspelled_value, 'TranscieverClass']
    )
    # This also places transceiver_class before lat_lon.
    df_resampled_ais = df_resampled_ais[
        ['mmsi',
         'time_seen',
         'lat',
         'lon',
         'sog',
         'cog',
         'heading',
         'vessel_name',
         'imo',
         'call_sign',
         'vessel_type',
         'status',
         'length',
         'width',
         'draft',
         'cargo',
         'transceiver_class',
         'lat_lon'
         ]
    ]


print(df_resampled_ais.shape)
# Make sure the output directory exists so that to_csv does not fail on a fresh checkout.
os.makedirs(resampled_dir, exist_ok=True)
df_resampled_ais.to_csv(os.path.join(resampled_dir, 'resampled_ais_2020.csv'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(929174, 18)


In [6]:
# Preview the first rows of the resampled frame.
df_resampled_ais.head()

Unnamed: 0,mmsi,time_seen,lat,lon,sog,cog,heading,vessel_name,imo,call_sign,vessel_type,status,length,width,draft,cargo,transceiver_class,lat_lon
0,565807000.0,2020-01-01 00:00:00,32.31175,-117.53433,14.7,162.1,163.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.3118,-117.5343"
1,565807000.0,2020-01-01 00:31:18,32.18897,-117.48776,14.7,160.0,158.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.189,-117.4878"
2,565807000.0,2020-01-01 01:00:00,32.08009,-117.43979,14.5,161.7,163.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"32.0801,-117.4398"
3,565807000.0,2020-01-01 01:30:36,31.96277,-117.39213,14.6,163.0,165.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"31.9628,-117.3921"
4,565807000.0,2020-01-01 02:00:00,31.84855,-117.35075,14.8,160.2,161.0,NYK CLARA,IMO9355408,9VFW9,70.0,0.0,210.0,30.0,11.5,72.0,A,"31.8486,-117.3508"


In [7]:
# Count the missing values in each column of the resampled frame.
df_resampled_ais.isna().sum()

mmsi                      0
time_seen                 0
lat                       0
lon                       0
sog                       0
cog                       0
heading                   0
vessel_name               0
imo                       0
call_sign                 0
vessel_type               0
status                  819
length                34360
width                 76666
draft                 94982
cargo                479957
transceiver_class         0
lat_lon                   0
dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>