In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from shapely.geometry import Point
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras import regularizers
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten
from tensorflow.keras import layers

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
path = 'data/preprocessed'
file_name = 'combined_only'
df = pd.read_csv(f'../{path}/{file_name}.csv')
df.head(3)

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon
0,1252340000000.0,1325376000.0,0.0,0.0,0.0,153.0,52.458649,4.5812
1,1252340000000.0,1325378000.0,0.0,0.0,0.0,153.0,52.458668,4.581167
2,1252340000000.0,1325379000.0,0.0,0.0,0.0,153.0,52.458633,4.581183


In [3]:
df['mmsi'].value_counts().count(), df.shape

(354, (28581398, 8))

## Keep small sample

In [4]:
sample_size = 1000000 

start_index = 0  

# Create a smaller sequential subset
data = df.iloc[-sample_size:]

data.shape

(1000000, 8)

# Recurrent Neural Network setup

## Data prep

In [5]:
number_vessel=data['mmsi'].value_counts().count()
print('Total number of vessels in our small dataset: ', number_vessel)

Total number of vessels in our small dataset:  21


### Split data by vessels

In [6]:
data.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon
27581398,133747100000000.0,1457563000.0,11661.617188,38469.824219,8.0,333.200012,43.2813,16.269434
27581399,133747100000000.0,1457564000.0,11401.474609,38274.378906,8.6,285.5,43.29398,16.247747
27581400,133747100000000.0,1457564000.0,12648.800781,39407.152344,8.5,314.200012,43.295353,16.228584
27581401,133747100000000.0,1457565000.0,9486.599609,37535.726562,8.4,321.5,43.317936,16.205826
27581402,133747100000000.0,1457565000.0,8944.052734,37482.410156,8.4,320.700012,43.323536,16.198494


In [7]:
X = data.drop(columns=['timestamp', 'lat', 'lon'])
y = data[['mmsi', 'lat', 'lon']]

In [8]:
y.head(2)

Unnamed: 0,mmsi,lat,lon
27581398,133747100000000.0,43.2813,16.269434
27581399,133747100000000.0,43.29398,16.247747


In [9]:
grouped_X = X.groupby('mmsi')

# List to store the NumPy arrays for each group
X_group_arrays = []

# Iterate through each group and store the data as a NumPy array
for mmsi_value, group_df in grouped_X:
    # 'group_df' contains the subset of data for the current 'mmsi' group
    # Convert the relevant columns to a NumPy array and append it to the list
    X_group_array = group_df.values
    X_group_arrays.append(X_group_array)
    
assert(len(X_group_arrays) == number_vessel)

In [10]:
grouped_y = y.groupby('mmsi')

# List to store the NumPy arrays for each group
y_group_arrays = []

# Iterate through each group and store the data as a NumPy array
for mmsi_value, group_df in grouped_y:
    # 'group_df' contains the subset of data for the current 'mmsi' group
    # Convert the relevant columns to a NumPy array and append it to the list
    y_group_array = group_df.drop('mmsi', axis=1).values
    y_group_arrays.append(y_group_arrays)

assert(len(y_group_arrays) == number_vessel)

### Padding

In [12]:
X_pad = pad_sequences(X_group_arrays, dtype='float32', padding='post', value=-1000)
print('X_train_pad shape: ',X_pad.shape)

X_train_pad shape:  (21, 227003, 5)


In [13]:
pd.DataFrame(X_pad[0])

Unnamed: 0,0,1,2,3,4
0,1.337472e+14,11661.617188,38469.824219,8.0,333.200012
1,1.337472e+14,11401.474609,38274.378906,8.6,285.500000
2,1.337472e+14,12648.800781,39407.152344,8.5,314.200012
3,1.337472e+14,9486.599609,37535.726562,8.4,321.500000
4,1.337472e+14,8944.052734,37482.410156,8.4,320.700012
...,...,...,...,...,...
226998,-1.000000e+03,-1000.000000,-1000.000000,-1000.0,-1000.000000
226999,-1.000000e+03,-1000.000000,-1000.000000,-1000.0,-1000.000000
227000,-1.000000e+03,-1000.000000,-1000.000000,-1000.0,-1000.000000
227001,-1.000000e+03,-1000.000000,-1000.000000,-1000.0,-1000.000000


### Split train / test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_group_arrays, test_size=0.3, shuffle=False)
print(X_train.shape)
print(X_test.shape)

(14, 227003, 5)
(7, 227003, 5)


In [25]:
# Create an RNN model
model = Sequential()
model.add(layers.Masking(mask_value=-1000, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(layers.LSTM(64, activation='relu'))
model.add(layers.Dense(2))  # Two outputs: latitude and longitude

In [26]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 227003, 5)         0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                17920     
                                                                 
 dense_5 (Dense)             (None, 2)                 130       
                                                                 
Total params: 18,050
Trainable params: 18,050
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=3)

# Evaluate the model
loss = model.evaluate(X_test, y_test)

# Make predictions
predictions = model.predict(X_test)

RecursionError: maximum recursion depth exceeded in __instancecheck__