In [1]:
# Parameters (papermill will inject these)
# The values below are placeholders only: the data-loading cell further down
# does not accept them as-is, so every run must override all three.
augmentation_type = "default"  # Compared against "baseline", "GNI", "vae" and "kmeans" when the training file is chosen
seed = -1      # Interpolated into the train/val/test CSV file names and into the MLflow run name
num_aug = -1  # Interpolated into the augmented CSV file name; 0 together with "baseline" selects the un-augmented file

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import random
import os
import mlflow
import mlflow.keras

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

In [3]:
# One fixed seed for every run, kept separate from the papermill `seed`
# parameter (which only selects the data files).
comparison_seed = 3141

# Request deterministic TensorFlow op implementations.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Seed Python's, NumPy's and TensorFlow's global random generators with the
# same value, in that order.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(comparison_seed)

In [4]:
def summarize_dataframe(df):
    """
    Display a DataFrame and print a short summary of its vessel groups.

    Parameters:
        df (DataFrame): Frame to summarise; must contain a 'VesselGroup' column.

    Returns:
        None. The frame is rendered with `display` and the summary (total row
        count, number of groups, the groups themselves and the row count per
        group) is written to stdout.
    """
    display(df)

    vessel_column = df['VesselGroup']
    unique_vessel_groups = vessel_column.unique()
    num_vessel_groups = vessel_column.nunique()

    # Collect the report first, then emit it in one go; the text is the same
    # as printing each line separately.
    report = [
        f"The dataframe contains {len(df):,} rows.",
        "",
        f"There are {num_vessel_groups} unique vessel groups.",
        f"The unique vessel groups are: {unique_vessel_groups}",
        "\nRow counts for each vessel group:",
    ]
    for group in unique_vessel_groups:
        # Counting the True entries of the mask equals the length of the filtered frame.
        group_count = int((vessel_column == group).sum())
        report.append(f"Group {group}: {group_count:,} rows")

    print("\n".join(report))

In [5]:
# Sliding-window settings, fixed after the grid search:
# rows per input window and the stride between consecutive windows.
window_size, step_size = 50, 1

# Model / training hyper-parameters
lstm_units = 64
batch_size = 64
epochs = 50

In [6]:
# MLflow run name: baseline runs are identified by the seed alone, every other
# run also encodes the augmentation type and the augmentation amount.
run_name = (
    f"baseline_{seed}"
    if augmentation_type == "baseline"
    else f"{augmentation_type}_aug{num_aug}_{seed}"
)

# Load Data

In [None]:
# Root folder that holds the train/val/test splits and the augmented training sets.
data_dir = "/Users/fabian/Downloads/MasterDegree"

# Augmentation types for which a file-name prefix is known.
supported_augmentations = ("GNI", "vae", "kmeans")

if augmentation_type == "baseline" and num_aug == 0:
    # Baseline: the training split without any augmentation.
    file_path = f"{data_dir}/df_train_{seed}.csv"
elif augmentation_type in supported_augmentations:
    # Augmented training data; folder and file names are derived from the type.
    file_prefix = augmentation_type
    file_path = f"{data_dir}/{file_prefix}/{file_prefix}{seed}/df_{file_prefix}_{num_aug}.csv"
else:
    # Fail early with a clear message instead of hitting an undefined
    # `file_prefix` further down (e.g. when the papermill placeholders were
    # not overridden, or "baseline" is combined with num_aug != 0).
    raise ValueError(
        f"Unsupported parameters augmentation_type={augmentation_type!r}, num_aug={num_aug!r}: "
        f"expected 'baseline' with num_aug == 0, or one of {supported_augmentations}."
    )
print(file_path)

df_train = pd.read_csv(file_path)
df_test = pd.read_csv(f"{data_dir}/df_test_{seed}.csv")
df_val = pd.read_csv(f"{data_dir}/df_val_{seed}.csv")

In [None]:
# Show every split together with its vessel-group statistics.
for split_title, split_frame in (
    ("\n===== Train Data =====", df_train),
    ("\n===== Test Data =====", df_test),
    ("\n===== Validation Data =====", df_val),
):
    print(split_title)
    summarize_dataframe(split_frame)

In [None]:
# Row counts per split and in total.
train_size, test_size, val_size = (len(split) for split in (df_train, df_test, df_val))
rows_sum = sum((train_size, test_size, val_size))

print(f"The DataFrame (without augmentation) contains {rows_sum} rows.")

In [None]:
# Distinct vessel groups present in the training data.
vessel_group_series = df_train['VesselGroup']
unique_vessel_groups = vessel_group_series.unique()
print(unique_vessel_groups)

# Preprocessing Data

In [None]:
# Tag every split so the rows can be separated again after scaling.
df_train['Dataset'] = 'train'
df_val['Dataset'] = 'val'
df_test['Dataset'] = 'test'

feature_columns = ['LAT', 'LON', 'SOG', 'COG']
target_columns = ['LAT', 'LON']

combined_data = pd.concat([df_train, df_val, df_test], axis=0).reset_index(drop=True)

print("\nCombined dataset preview:")
print(combined_data[feature_columns + ['Dataset']].head())

scaler_features = MinMaxScaler()
scaler_targets = MinMaxScaler()

# Fit the scalers on the training rows only. Fitting on the combined frame
# would leak the value ranges of the validation and test splits into
# preprocessing. Validation/test values outside the training range are simply
# mapped outside [0, 1]; the inverse transform is unaffected.
# Both scalers are fitted on the same rows so that the LAT/LON range of
# `scaler_targets` matches the first two columns of `scaler_features`: the
# targets are read from the feature-scaled LAT/LON columns and are later
# inverse-transformed with `scaler_targets`.
is_train_row = combined_data['Dataset'] == 'train'
scaler_features.fit(combined_data.loc[is_train_row, feature_columns])
scaler_targets.fit(combined_data.loc[is_train_row, target_columns])

print("\nScaler Features Min/Max:")
print("Min:", scaler_features.data_min_)
print("Max:", scaler_features.data_max_)

print("\nScaler Targets Min/Max:")
print("Min:", scaler_targets.data_min_)
print("Max:", scaler_targets.data_max_)

# Apply the training-fitted scaling to every split.
combined_data[feature_columns] = scaler_features.transform(combined_data[feature_columns])

scaled_train = combined_data[combined_data['Dataset'] == 'train'].drop(columns=['Dataset']).reset_index(drop=True)
scaled_val = combined_data[combined_data['Dataset'] == 'val'].drop(columns=['Dataset']).reset_index(drop=True)
scaled_test = combined_data[combined_data['Dataset'] == 'test'].drop(columns=['Dataset']).reset_index(drop=True)

print(f"\nDEBUG: Row Counts - Train: {len(scaled_train)}, Val: {len(scaled_val)}, Test: {len(scaled_test)}")
print("\nScaled Train Data Preview:")
print(scaled_train[feature_columns].head())
print("\nScaled Validation Data Preview:")
print(scaled_val[feature_columns].head())
print("\nScaled Test Data Preview:")
print(scaled_test[feature_columns].head())

## Functions

In [12]:
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """
    Great-circle distance between two points using the haversine formula.

    Parameters:
        lat1, lon1: Latitude/longitude of the first point(s) in decimal degrees
            (scalars, NumPy arrays or pandas Series).
        lat2, lon2: Latitude/longitude of the second point(s) in decimal degrees.
        radius_km (float): Sphere radius; the default 6371 gives kilometres on Earth.

    Returns:
        Distance(s) in the unit of `radius_km`, broadcast over the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would turn arcsin(sqrt(a)) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius_km

In [13]:
def preprocess_combined_data(df, window_size, step_size):
    """
    Build sliding-window sequences per vessel (MMSI) from a scaled DataFrame.

    Rows are sorted by MMSI and BaseDateTime; within each MMSI every window of
    `window_size` consecutive rows becomes one sample whose target is the
    LAT/LON of the row that directly follows the window.

    Parameters:
        df (DataFrame): The scaled DataFrame containing MMSI, BaseDateTime,
            LAT, LON, SOG and COG columns.
        window_size (int): The size of the sliding window (>= 1).
        step_size (int): The step size for the sliding window (>= 1).

    Returns:
        X (np.ndarray): Feature sequences of shape (num_samples, window_size, num_features).
        y (np.ndarray): Target values of shape (num_samples, num_targets).
        Both keep these ranks even when no window could be built (num_samples == 0).

    Raises:
        ValueError: If `window_size` or `step_size` is smaller than 1.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError(
            f"window_size and step_size must be >= 1 (got {window_size} and {step_size})"
        )

    # Ensure the dataset is sorted by MMSI and time to maintain sequences;
    # the fresh RangeIndex makes group.index usable as positions below.
    df = df.sort_values(['MMSI', 'BaseDateTime']).reset_index(drop=True)

    features = df[['LAT', 'LON', 'SOG', 'COG']].values
    targets = df[['LAT', 'LON']].values

    X, y = [], []
    for _, group in df.groupby('MMSI'):
        rows = group.index.to_numpy()
        data = features[rows]
        target = targets[rows]

        # Vessels with at most `window_size` rows yield no sample.
        for i in range(0, len(data) - window_size, step_size):
            X.append(data[i:i + window_size])
            y.append(target[i + window_size])

    if not X:
        # np.array([]) would be 1-D and break code that reads X.shape[1:].
        return (
            np.empty((0, window_size, features.shape[1]), dtype=features.dtype),
            np.empty((0, targets.shape[1]), dtype=targets.dtype),
        )

    return np.array(X), np.array(y)

In [14]:
def rescale_predictions(y_test, y_pred, scaler_targets):
    """
    Map scaled actual and predicted LAT/LON pairs back to their original range.

    Parameters:
        y_test (np.ndarray): Actual scaled target values; reshaped to (-1, 2).
        y_pred (np.ndarray): Predicted scaled target values; reshaped to (-1, 2).
        scaler_targets (MinMaxScaler): Scaler whose `inverse_transform` undoes the target scaling.

    Returns:
        original_actuals, original_predictions (np.ndarray, np.ndarray): Rescaled actuals and predictions.
    """
    actual_scaled, predicted_scaled = (arr.reshape(-1, 2) for arr in (y_test, y_pred))

    original_actuals = scaler_targets.inverse_transform(actual_scaled)
    original_predictions = scaler_targets.inverse_transform(predicted_scaled)

    # Print at most the first five rows as a visual sanity check.
    print("\nDEBUG: Rescaled Values Validation")
    preview_count = min(5, len(original_actuals))
    row = 0
    while row < preview_count:
        print(f"Row {row}:")
        print(f"  Scaled Actual: {actual_scaled[row]}")
        print(f"  Rescaled Actual: {original_actuals[row]}")
        print(f"  Scaled Predicted: {predicted_scaled[row]}")
        print(f"  Rescaled Predicted: {original_predictions[row]}")
        row += 1

    return original_actuals, original_predictions

In [15]:
def generate_comparison_df(y_test, y_pred, test_data, scaler_targets, window=None, step=None):
    """
    Generates a detailed comparison DataFrame.

    The metadata columns (BaseDateTime, MMSI, VesselGroup) are taken from the
    rows that the predictions actually refer to. The windowing of
    `preprocess_combined_data` is replayed here: rows are sorted by MMSI and
    BaseDateTime, and within each MMSI the targets are the rows at offsets
    `window`, `window + step`, ... Taking the first N rows of `test_data`
    instead would attach the wrong vessel and timestamp to every prediction.

    Parameters:
        y_test (np.ndarray): Actual scaled test target values (LAT, LON).
        y_pred (np.ndarray): Predicted scaled test target values (LAT, LON).
        test_data (DataFrame): The test frame that was passed to
            `preprocess_combined_data` (needs MMSI, BaseDateTime, VesselGroup, LAT, LON).
        scaler_targets (MinMaxScaler): Pre-fitted scaler used for inverse transformation.
        window (int, optional): Window size used to build `y_test`.
            Defaults to the notebook-level `window_size`.
        step (int, optional): Step size used to build `y_test`.
            Defaults to the notebook-level `step_size`.

    Returns:
        DataFrame: A detailed comparison DataFrame with Haversine distances.

    Raises:
        ValueError: If the number of reconstructed target rows differs from the
            number of predictions (wrong `window`/`step` or wrong `test_data`).
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    window = window_size if window is None else window
    step = step_size if step is None else step

    y_test_flattened = y_test.reshape(-1, 2)
    y_pred_flattened = y_pred.reshape(-1, 2)

    original_actuals = scaler_targets.inverse_transform(y_test_flattened)
    original_predictions = scaler_targets.inverse_transform(y_pred_flattened)

    # Replay the sample order of preprocess_combined_data to find the target rows.
    ordered = test_data.sort_values(['MMSI', 'BaseDateTime']).reset_index(drop=True)
    target_positions = []
    for _, group in ordered.groupby('MMSI'):
        rows = group.index.to_numpy()
        # Same positions as `i + window for i in range(0, len(rows) - window, step)`.
        target_positions.extend(rows[window:len(rows):step])
    metadata = ordered.iloc[target_positions].reset_index(drop=True)

    if len(metadata) != len(original_predictions):
        raise ValueError(
            f"Metadata length ({len(metadata)}) does not match predictions "
            f"({len(original_predictions)}); check window={window}, step={step} and test_data"
        )

    # Cross-check the alignment: the LAT/LON stored in the metadata rows must
    # equal the actual targets, either in scaled or in original units
    # depending on which frame the caller passed in.
    metadata_lat_lon = metadata[['LAT', 'LON']].to_numpy(dtype=float)
    if len(metadata) and not (
        np.allclose(metadata_lat_lon, y_test_flattened)
        or np.allclose(metadata_lat_lon, original_actuals)
    ):
        print("\nWARNING: LAT/LON of the metadata rows do not match the actual targets")

    print("\nDEBUG: Rescaled Values Comparison")
    print("Original LAT/LON from test_data:")
    print(metadata_lat_lon[:5])
    print("Rescaled Actuals (Original Range):")
    print(original_actuals[:5])
    print("Rescaled Predictions (Original Range):")
    print(original_predictions[:5])

    comparison_df = pd.DataFrame({
        'BaseDateTime': metadata['BaseDateTime'],
        'MMSI': metadata['MMSI'],
        'VesselGroup': metadata['VesselGroup'],
        'Actual LAT': original_actuals[:, 0],
        'Actual LON': original_actuals[:, 1],
        'Predicted LAT': original_predictions[:, 0],
        'Predicted LON': original_predictions[:, 1],
    })

    comparison_df['Haversine Distance (km)'] = haversine(
        comparison_df['Actual LAT'], comparison_df['Actual LON'],
        comparison_df['Predicted LAT'], comparison_df['Predicted LON']
    )

    return comparison_df

In [16]:
def verify_scaling_and_rescaling(group_test, scaler_features, scaler_targets):
    """
    Round-trips the data through both scalers and reports LAT/LON deviations.

    Parameters:
        group_test (DataFrame): The test data containing LAT, LON, SOG, and COG.
        scaler_features (MinMaxScaler): Pre-fitted scaler used for features (LAT, LON, SOG, COG).
        scaler_targets (MinMaxScaler): Pre-fitted scaler used for target values (LAT, LON).

    Returns:
        dict: "feature_comparison" / "target_comparison" hold the rescaled values
        next to the original LAT/LON and their differences;
        "inconsistent_features" / "inconsistent_targets" hold the rows whose
        absolute LAT or LON difference exceeds 1e-6.
    """
    feature_columns = ['LAT', 'LON', 'SOG', 'COG']
    target_columns = ['LAT', 'LON']

    def round_trip(scaler, columns):
        # transform followed by inverse_transform should reproduce the input
        return scaler.inverse_transform(scaler.transform(group_test[columns]))

    def compare(rescaled, columns, title, noun):
        # Put the rescaled values next to the originals and flag deviating rows
        comparison = pd.DataFrame(rescaled, columns=columns)
        for coord in ('LAT', 'LON'):
            comparison[f'Original {coord}'] = group_test[coord].values
        for coord in ('LAT', 'LON'):
            comparison[f'{coord} Difference'] = comparison[f'Original {coord}'] - comparison[coord]

        deviation = comparison[['LAT Difference', 'LON Difference']].abs()
        inconsistent = comparison[(deviation > 1e-6).any(axis=1)]

        print(f"\nChecking Scaling and Rescaling Consistency for {title}:")
        print(f"Number of inconsistent rows in {noun}: {len(inconsistent)}")
        if len(inconsistent) > 0:
            print(f"Sample inconsistencies for {noun}:")
            print(inconsistent.head())

        return comparison, inconsistent

    # Run both round trips before any reporting.
    rescaled_features = round_trip(scaler_features, feature_columns)
    rescaled_targets = round_trip(scaler_targets, target_columns)

    feature_comparison, inconsistent_features = compare(
        rescaled_features, feature_columns, "Features", "features"
    )
    target_comparison, inconsistent_targets = compare(
        rescaled_targets, target_columns, "Targets", "targets"
    )

    return dict(
        feature_comparison=feature_comparison,
        inconsistent_features=inconsistent_features,
        target_comparison=target_comparison,
        inconsistent_targets=inconsistent_targets,
    )

### Training

In [17]:
# Point MLflow at the tracking server on localhost and select the experiment.
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
mlflow.set_experiment("Final LAT_LON Pred Baseline")

# Number of distinct vessel groups in the training data.
group_count = df_train['VesselGroup'].nunique()

# Haversine accumulators (overall and per vessel type), all starting at zero.
avg_haversine_sum = avg_haversine_tanker = avg_haversine_fishing = avg_haversine_cargo = 0

average_haversine_distances = list()

In [None]:
# Vessel groups of the training data and how many distinct ones there are.
train_vessel_column = df_train['VesselGroup']
vessel_groups = train_vessel_column.unique()
num_classes = train_vessel_column.nunique()
print(num_classes)
print("Vessel groups found:", vessel_groups)

In [19]:
def create_lstm_model(input_shape, lstm_units):
    """
    Builds and compiles a two-layer LSTM regressor with two outputs (LAT, LON).

    Parameters:
        input_shape (tuple): The shape of the input data (window_size, num_features).
        lstm_units (int): Number of units in each of the two LSTM layers.

    Returns:
        model (keras.Model): Sequential model compiled with Adam (learning rate
        0.001), MSE loss and MAE as metric.
    """
    layer_stack = [
        # First LSTM returns the full sequence so the second LSTM can consume it
        LSTM(lstm_units, activation='relu', input_shape=input_shape, return_sequences=True),
        LSTM(lstm_units, activation='relu'),
        # Two linear outputs
        Dense(2),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=['mae'])
    return model

In [None]:
with mlflow.start_run(run_name=run_name) as run:
    print("\n===== Processing All Vessel Groups Together =====")

    train_data = scaled_train
    test_data = scaled_test
    val_data = scaled_val

    # Turn every split into (window, next-position) samples.
    X_train, y_train = preprocess_combined_data(train_data, window_size, step_size)
    X_test, y_test = preprocess_combined_data(test_data, window_size, step_size)
    X_val, y_val = preprocess_combined_data(val_data, window_size, step_size)

    print("\nDEBUG: Dataset shapes after preprocessing:")
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_test shape: {y_test.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"y_val shape: {y_val.shape}")

    # Stop with a clear message when a split produced no window at all
    # (no vessel with more than `window_size` rows).
    for split_name, split_samples in (("train", X_train), ("test", X_test), ("val", X_val)):
        if len(split_samples) == 0:
            raise ValueError(
                f"No sequences could be built for the {split_name} split with window_size={window_size}"
            )

    input_shape = (X_train.shape[1], X_train.shape[2])
    model = create_lstm_model(input_shape, lstm_units)
    model.summary()

    run_params = {
        "epochs": epochs,
        "window_size": window_size,
        "step size": step_size,
        "lstm_units": lstm_units,
        "num_classes": num_classes,
        "batch_size": batch_size,
        "train_size": train_size,
        "test_size": test_size,
        "val_size": val_size,
        "rows_sum": rows_sum,
        "num_aug": num_aug,
        "aug_type": augmentation_type,
        "seed": seed,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Write the architecture and the scaled training data to temporary files,
    # attach them to the run and remove them again even if logging fails.
    model_architecture = model.to_json()
    file_path = "model_architecture_reg.json"
    train_csv_path = "df_train_reg.csv"
    try:
        with open(file_path, "w") as f:
            f.write(model_architecture)
        train_data.to_csv(train_csv_path, index=False)

        mlflow.log_artifact(train_csv_path)
        mlflow.log_artifact(file_path)
    finally:
        for temp_path in (train_csv_path, file_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1
        )

    # Log the model only after fit() so the stored artifact holds the trained weights.
    mlflow.keras.log_model(model, "all_vessel_groups_model")

    for epoch, loss in enumerate(history.history['loss']):
        mlflow.log_metric("Training Loss", loss, step=epoch)
    for epoch, val_loss in enumerate(history.history['val_loss']):
        mlflow.log_metric("Validation Loss", val_loss, step=epoch)

    print("\nPredicting for All Vessel Groups:")
    y_pred = model.predict(X_test, batch_size=32)
    print(f"Number of samples in X_test: {len(X_test)}")
    print(f"Number of predictions: {len(y_pred)}")

    comparison_df = generate_comparison_df(y_test, y_pred, test_data, scaler_targets)
    assert len(comparison_df) == len(y_pred), "Mismatch between comparison_df and predictions"

    display(comparison_df)

    avg_haversine = comparison_df['Haversine Distance (km)'].mean()
    mlflow.log_metric("Average Haversine Distance", avg_haversine)
    print(f"Average Haversine Distance: {avg_haversine:.4f} km")

    # Threshold (km) above which a prediction is listed as an extreme outlier.
    large_haversine = 50
    large_distances = comparison_df[comparison_df['Haversine Distance (km)'] > large_haversine]
    print()
    print(f"Rows with extreme large haversine distances greater than {large_haversine}:")
    display(large_distances)

In [None]:
# Training and validation loss per epoch, drawn on one explicit Axes.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
plt.show()