In [1]:
# Parameters (papermill will inject these)
# The values below are placeholders only: "default" matches none of the
# augmentation types handled further down ("baseline", "GNI", "vae", "kmeans"),
# so a manual run must set real values here first.
augmentation_type = "default"  # Default value, overridden by papermill; selects the training file and run name
seed = -1      # Default value, overridden by papermill; part of the train/val/test file names and the run name
num_aug = -1  # Default value, overridden by papermill; part of the augmented file name and the run name

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
import random
import os

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense, Input, Dropout, Bidirectional
from keras.regularizers import l2
from keras.utils import to_categorical
from keras.optimizers import Adam

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

import mlflow
import mlflow.keras

from io import BytesIO, StringIO

In [4]:
# One fixed seed for every run, independent of the papermill `seed` parameter
# (which only selects the data files).
comparison_seed = 3141

# Environment flag read by TensorFlow; set before any model code runs.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Seed the Python, NumPy and TensorFlow generators, in that order.
for _seed_generator in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_generator(comparison_seed)

In [5]:
def summarize_dataframe(df):
    """
    Display a dataframe and print its total and per-vessel-group row counts.

    Parameters:
        df (pd.DataFrame): Frame containing a 'VesselGroup' column.

    Returns:
        None. The frame is rendered with `display` and the counts are printed.
    """
    display(df)

    group_column = df['VesselGroup']
    unique_vessel_groups = group_column.unique()
    # nunique() ignores missing values, unique() does not, so the two can
    # differ by one when the column contains NaN.
    num_vessel_groups = group_column.nunique()

    num_rows = f"{len(df):,}"

    print(f"The dataframe contains {num_rows} rows.")
    print()
    print(f"There are {num_vessel_groups} unique vessel groups.")
    print(f"The unique vessel groups are: {unique_vessel_groups}")

    print("\nRow counts for each vessel group:")
    for group in unique_vessel_groups:
        # `column == NaN` is never true, so a missing group would always be
        # reported with 0 rows; count it with isna() instead.
        if pd.isna(group):
            is_member = group_column.isna()
        else:
            is_member = group_column == group
        group_count = int(is_member.sum())
        print(f"Group {group}: {group_count:,} rows")

In [7]:
# Sliding-window configuration: rows per window and offset between window starts.
step_size = 1
window_size = 40

# Model Parameters
lstm_units = 64
dropout_rate = 0.2
batch_size = 64
epochs = 50

In [8]:
# Name used for the MLflow run: baseline runs are identified by seed only,
# augmented runs by augmentation type, augmentation count and seed.
run_name = (
    f"baseline_{seed}"
    if augmentation_type == "baseline"
    else f"{augmentation_type}_aug{num_aug}_{seed}"
)

# Load Prepared Data

In [None]:
# Root folder holding the prepared CSV splits.
# NOTE(review): absolute, machine-specific path; consider making it a parameter.
data_root = "/Users/fabian/Downloads/MasterDegree"

# Augmentation types that have augmented training files, mapped to the prefix
# used in their folder and file names.
augmentation_prefixes = {"GNI": "GNI", "vae": "vae", "kmeans": "kmeans"}

if augmentation_type == "baseline" and num_aug == 0:
    # Un-augmented training split.
    file_path = f"{data_root}/df_train_{seed}.csv"
elif augmentation_type in augmentation_prefixes:
    file_prefix = augmentation_prefixes[augmentation_type]
    file_path = f"{data_root}/{file_prefix}/{file_prefix}{seed}/df_{file_prefix}_{num_aug}.csv"
else:
    # Fail with a clear message instead of hitting an undefined (or stale)
    # `file_prefix`, e.g. when the placeholder parameters were never overridden.
    raise ValueError(
        f"No training file is defined for augmentation_type={augmentation_type!r} "
        f"with num_aug={num_aug!r}; expected 'baseline' with num_aug == 0 or one of "
        f"{sorted(augmentation_prefixes)}."
    )
print(file_path)

# Validation and test splits depend only on the seed.
df_train = pd.read_csv(file_path)
df_test = pd.read_csv(f"{data_root}/df_test_{seed}.csv")
df_val = pd.read_csv(f"{data_root}/df_val_{seed}.csv")

In [None]:
# Print a banner and a summary for each split, in train / test / validation order.
for banner, split_frame in (
    ("\n===== Train Data =====", df_train),
    ("\n===== Test Data =====", df_test),
    ("\n===== Validation Data =====", df_val),
):
    print(banner)
    summarize_dataframe(split_frame)

In [None]:
# Row counts per split; these are also logged as MLflow parameters later on.
train_size, test_size, val_size = map(len, (df_train, df_test, df_val))

rows_sum = sum((train_size, test_size, val_size))

# NOTE(review): df_train is read from an augmented file for non-baseline runs,
# so "without augmentation" in this message may not hold - confirm the wording.
print(f"The DataFrame (without augmentation) contains {rows_sum} rows.")

In [None]:
# Show the dtype pandas inferred for the MMSI column of the training split.
mmsi_dtype = df_train['MMSI'].dtype
print(mmsi_dtype)

In [None]:
# Print the MMSI values shared by each pair of splits
# (train/test, train/val, val/test); an empty set means no overlap.
for left_split, right_split in (
    (df_train, df_test),
    (df_train, df_val),
    (df_val, df_test),
):
    print(set(left_split['MMSI']).intersection(right_split['MMSI']))

# Training

In [14]:
def create_sliding_windows(data, labels, window_size, step_size):
    """
    Create sliding windows from data and labels.

    Each window covers `window_size` consecutive rows of `data` and is labelled
    with the label of its last row. Windows start every `step_size` rows and
    include the final full window that ends on the last row.

    Parameters:
        data (np.array): The feature array of shape (num_samples, num_features).
        labels (np.array): The label array (one-hot encoded) of shape (num_samples, num_classes).
        window_size (int): The size of each sliding window.
        step_size (int): The step size for the sliding window.

    Returns:
        np.array: The sliding window feature array of shape (num_windows, window_size, num_features).
        np.array: The sliding window label array of shape (num_windows, num_classes).
        When the input is shorter than one window, both arrays have zero rows
        but keep their trailing dimensions.

    Raises:
        ValueError: If data and labels differ in length, or if window_size or
            step_size is not positive.
    """
    if len(data) != len(labels):
        raise ValueError("Data and labels must have the same length.")
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must be positive integers.")

    data = np.asarray(data)
    labels = np.asarray(labels)

    # The last valid start index is len(data) - window_size, hence the + 1:
    # without it the final window (and the only window when
    # len(data) == window_size) would be dropped.
    window_starts = range(0, len(data) - window_size + 1, step_size)

    if len(window_starts) == 0:
        # Keep the trailing dimensions so downstream shape lookups still work.
        empty_X = np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + labels.shape[1:], dtype=labels.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + window_size] for start in window_starts])
    # The label of a window is the label of its last row.
    y = np.stack([labels[start + window_size - 1] for start in window_starts])

    return X, y

In [None]:
# Snapshot of the training frame; it is attached to the MLflow run as an
# artifact further down and deleted afterwards.
training_snapshot_csv = "df_class.csv"
df_train.to_csv(training_snapshot_csv, index=False)

# Point MLflow at the local tracking server and select the experiment.
tracking_uri = "http://127.0.0.1:5000"
experiment_name = "Final VG Pred Baseline"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

In [16]:
# Column holding the class to predict, and the numeric model inputs.
target_column = 'VesselGroup'
feature_columns = ['LAT', 'LON', 'SOG', 'COG']

# Preprocessing objects: label encoder for the target, scaler for the features.
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training split ONLY. Fitting on train+val+test leaks
# the min/max of the evaluation splits into preprocessing, which makes the
# validation and test scores optimistic. Validation/test values may therefore
# fall slightly outside [0, 1] after scaling; that is expected.
scaler = MinMaxScaler()
scaler.fit(df_train[feature_columns])

# The label encoder is fitted on the labels of all splits so that every class
# gets the same integer id everywhere (labels carry no feature statistics).
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat(
        [df_train[target_column], df_val[target_column], df_test[target_column]],
        ignore_index=True,
    )
)


def _scale_and_encode(split_frame):
    """Return a copy of `split_frame` with scaled features and a 'VesselGroup_encoded' column."""
    prepared = split_frame.copy()  # leave the loaded frames untouched
    prepared[feature_columns] = scaler.transform(prepared[feature_columns])
    prepared['VesselGroup_encoded'] = label_encoder.transform(prepared[target_column])
    return prepared


df_train_scaled = _scale_and_encode(df_train)
df_val_scaled = _scale_and_encode(df_val)
df_test_scaled = _scale_and_encode(df_test)

# Combined view, tagged by split, for visual inspection only.
df_combined = pd.concat(
    [
        df_train_scaled.assign(Dataset='train'),
        df_val_scaled.assign(Dataset='val'),
        df_test_scaled.assign(Dataset='test'),
    ],
    ignore_index=True,
)
display(df_combined)

train_features = df_train_scaled[feature_columns].values
train_labels = df_train_scaled['VesselGroup_encoded'].values

val_features = df_val_scaled[feature_columns].values
val_labels = df_val_scaled['VesselGroup_encoded'].values

test_features = df_test_scaled[feature_columns].values
test_labels = df_test_scaled['VesselGroup_encoded'].values

print("Classes in LabelEncoder:", label_encoder.classes_)
print("Training features shape:", train_features.shape)
print("Training labels shape:", train_labels.shape)
print("Validation features shape:", val_features.shape)
print("Validation labels shape:", val_labels.shape)
print("Test features shape:", test_features.shape)
print("Test labels shape:", test_labels.shape)

In [None]:
# Class names come from the fitted label encoder; their count sets the width
# of the one-hot vectors.
class_names = label_encoder.classes_
num_classes = len(class_names)
print(class_names)

# One-hot encode the integer labels of each split.
y_train, y_val, y_test = (
    to_categorical(encoded_labels, num_classes=num_classes)
    for encoded_labels in (train_labels, val_labels, test_labels)
)

for caption, one_hot in (
    ("y_train shape after one-hot encoding:", y_train),
    ("y_val shape after one-hot encoding:", y_val),
    ("y_test shape after one-hot encoding:", y_test),
):
    print(caption, one_hot.shape)

print("Unique vectors in y_train:", np.unique(y_train, axis=0))
print("y_train shape after one-hot encoding:", y_train.shape)

In [None]:
# Turn each split into fixed-length windows labelled by their last row.
# NOTE(review): this cell overwrites y_train / y_val / y_test with the windowed
# labels, so re-running it without re-running the one-hot cell feeds it arrays
# of the wrong length.
# NOTE(review): windows slide over all rows of a split in file order, so a
# window can contain rows from more than one MMSI - confirm this is intended.
window_args = (window_size, step_size)
X_train, y_train = create_sliding_windows(train_features, y_train, *window_args)
X_val, y_val = create_sliding_windows(val_features, y_val, *window_args)
X_test, y_test = create_sliding_windows(test_features, y_test, *window_args)

print("Sliding window shapes:")
for caption, windowed in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, windowed.shape)
 

In [None]:
# Recover integer class ids from the one-hot window labels of each split.
y_train_labels = y_train.argmax(axis=1)
y_val_labels = y_val.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# Print how many windows each class has, per split.
for caption, split_labels in (
    ("Class distribution in y_train:", y_train_labels),
    ("Class distribution in y_val:", y_val_labels),
    ("Class distribution in y_test:", y_test_labels),
):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(caption, dict(zip(unique, counts)))

In [None]:
# Integer class id of every training window and the set of ids that occur.
train_class_ids = np.argmax(y_train, axis=1)
present_classes = np.unique(train_class_ids)

# 'balanced' weights, one per class present in the training windows.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=train_class_ids
)

# Key each weight by its actual class id. Keying by position (enumerate) would
# attach weights to the wrong classes whenever a class id is missing from the
# training windows.
class_weights = {
    int(class_id): weight
    for class_id, weight in zip(present_classes, balanced_weights)
}
print(f"Class weights for baseline model: {class_weights}")

In [None]:
# Model input shape: (rows per window, features per row), taken from X_train.
timesteps = X_train.shape[1]
feature_count = X_train.shape[2]
input_shape = (timesteps, feature_count)
print(num_classes)

In [None]:
# Build the classifier layer by layer: LSTM -> Dropout -> Dense(32, relu)
# -> Dropout -> softmax over the vessel-group classes.
# NOTE(review): layers are created and added one at a time; keep this order,
# since reordering construction may change how the seeded generators are
# consumed - confirm before restructuring.
model = Sequential()
model.add(Input(shape=input_shape))
# return_sequences=False: one output vector per window; L2 penalty on the kernel weights.
model.add(LSTM(lstm_units, return_sequences=False, kernel_regularizer=l2(0.01)))
model.add(Dropout(dropout_rate))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(dropout_rate)) 
# One output unit per class; softmax pairs with the categorical cross-entropy loss below.
model.add(Dense(num_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0005)
# Labels are one-hot encoded, hence 'categorical_crossentropy'.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [24]:
# Write the model architecture as JSON. `file_path` is reused below when the
# file is attached to the MLflow run.
file_path = "model_architecture.json"
model_architecture = model.to_json()

with open(file_path, "w") as architecture_file:
    architecture_file.write(model_architecture)

In [None]:
# Start MLflow experiment run
with mlflow.start_run(run_name = run_name) as run:
    # Log artifacts and model parameters to MLflow
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("window_size", window_size)
    mlflow.log_param("step size", step_size)
    mlflow.log_param("lstm_units", lstm_units)
    mlflow.log_param("num_classes", num_classes)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("dropout_rate", dropout_rate)
    mlflow.log_param("train_size", train_size)
    mlflow.log_param("test_size", test_size)
    mlflow.log_param("val_size", val_size)
    mlflow.log_param("rows_sum", rows_sum)
    mlflow.log_param("num_aug", num_aug)
    mlflow.log_param("aug_type", augmentation_type)
    mlflow.log_param("seed", seed)

    mlflow.log_artifact(file_path)
    mlflow.log_artifact("df_class.csv")

    mlflow.keras.log_model(model, "class_lstm_baseline")
    
    history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1
    )

    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Model - Test Accuracy: {test_accuracy * 100:.2f}%")

    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_loss", test_loss)
    

    for epoch, acc in enumerate(history.history['accuracy']):
        mlflow.log_metric("training_accuracy", acc, step=epoch)
    for epoch, val_acc in enumerate(history.history['val_accuracy']):
        mlflow.log_metric("validation_accuracy", val_acc, step=epoch)
    for epoch, loss in enumerate(history.history['loss']):
        mlflow.log_metric("training_loss", loss, step=epoch)
    for epoch, val_loss in enumerate(history.history['val_loss']):
        mlflow.log_metric("validation_loss", val_loss, step=epoch)

print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

os.remove("df_class.csv")
os.remove("model_architecture.json")

In [None]:
# Predicted class probabilities for the test windows, reduced to class ids.
y_pred_prob = model.predict(X_test)

y_pred_labels = y_pred_prob.argmax(axis=1)

y_test_labels = y_test.argmax(axis=1)

# Support-weighted averages across classes.
f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
precision = precision_score(y_test_labels, y_pred_labels, average='weighted')
recall = recall_score(y_test_labels, y_pred_labels, average='weighted')

print(f'F1 Score (Weighted): {f1:.4f}')
print(f'Precision (Weighted): {precision:.4f}')
print(f'Recall (Weighted): {recall:.4f}')

# Pass the full list of class ids explicitly. Without `labels`, scikit-learn
# derives the classes from the data, so a class missing from both the test
# labels and the predictions no longer lines up with `class_names`.
all_class_ids = np.arange(len(class_names))

report = classification_report(
    y_test_labels, y_pred_labels,
    labels=all_class_ids, target_names=class_names, output_dict=True
)
print(
    "Classification Report:\n",
    classification_report(
        y_test_labels, y_pred_labels,
        labels=all_class_ids, target_names=class_names
    )
)

conf_matrix = confusion_matrix(y_test_labels, y_pred_labels, labels=all_class_ids)

plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')

plt.savefig("confusion_matrix.png")
plt.show()

# Show up to ten example predictions; guard against a shorter test set.
for i in range(min(10, len(y_test_labels))):
    true_label = class_names[y_test_labels[i]]
    predicted_label = class_names[y_pred_labels[i]]
    print(f"Sample {i}: True Class = {true_label}, Predicted Class = {predicted_label}")

In [None]:
# Which class ids the model actually predicted on the test windows.
y_pred_labels = np.argmax(y_pred_prob, axis=1)

unique_classes = np.unique(y_pred_labels)
num_unique_classes = unique_classes.size

print(f"Unique Predicted Classes: {unique_classes}")
print(f"Number of Unique Predicted Classes: {num_unique_classes}")

In [None]:
def _plot_history_pair(train_curve, val_curve, train_label, val_label, title, y_label):
    """Plot a training curve and a validation curve against epochs in one figure."""
    plt.figure(figsize=(12, 6))
    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


# Per-epoch values recorded by model.fit(); the accuracy entries may be absent.
fit_history = history.history
train_loss = fit_history['loss']
val_loss = fit_history['val_loss']
train_acc = fit_history.get('accuracy')
val_acc = fit_history.get('val_accuracy')

_plot_history_pair(
    train_loss, val_loss,
    'Training Loss', 'Validation Loss',
    'Training and Validation Loss', 'Loss',
)

# Only plot accuracy when both curves were recorded and are non-empty.
if train_acc and val_acc:
    _plot_history_pair(
        train_acc, val_acc,
        'Training Accuracy', 'Validation Accuracy',
        'Training and Validation Accuracy', 'Accuracy',
    )