In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import LineString
from prediction.data import vessel_groups
from prediction.preprocessing import load_and_build, remove_outliers_parallel
from prediction.preprocessing.trajectory_resampling import compare_trajectory_pairs, resample_trajectories
from prediction.visualization import plot_north_america, plot_trajectories

# Step 1: Resample Trajectories
def resample_data(train_df, test_df, interval_minutes=5):
    """
    Bring both datasets onto the same fixed sampling interval.

    Each DataFrame is passed through ``resample_trajectories`` independently,
    training data first and test data second.

    Args:
        train_df: DataFrame holding the training trajectories.
        test_df: DataFrame holding the test trajectories.
        interval_minutes: Spacing between resampled points, in minutes.

    Returns:
        A ``(resampled_train, resampled_test)`` pair.
    """
    train_resampled, test_resampled = (
        resample_trajectories(frame, interval_minutes=interval_minutes)
        for frame in (train_df, test_df)
    )
    return train_resampled, test_resampled

# Step 2: Split into Backward and Forward Trajectories
def split_trajectory(row, backward_time=1800, forward_time=1800):
    """
    Split a trajectory into backward and forward segments around its midpoint.

    The midpoint is the timestamp at index ``len(timestamps) // 2``. The
    backward segment holds every sample from ``midpoint - backward_time`` up to
    and including the midpoint; the forward segment holds every sample from the
    midpoint up to and including ``midpoint + forward_time``. Both windows are
    clipped to the first/last timestamp of the trajectory, and the midpoint
    sample belongs to both segments.

    Args:
        row: A single trajectory row with a ``"timestamps"`` sequence and a
            ``"geometry"`` LineString.
            NOTE(review): assumes timestamps are numeric seconds, so that the
            ``backward_time``/``forward_time`` offsets can be added directly —
            confirm against the resampler output.
        backward_time: Time (in seconds) to look backward from the midpoint.
        forward_time: Time (in seconds) to look forward from the midpoint.

    Returns:
        A tuple ``(backward_row, forward_row)``: copies of ``row`` whose
        ``"geometry"`` is replaced by the LineString of the selected samples.
        Every other field, including ``"timestamps"``, is copied unchanged.

    Raises:
        ValueError: If the geometry does not have exactly one vertex per
            timestamp, since samples could not be matched to vertices.
    """
    timestamps = row["timestamps"]
    midpoint_time = timestamps[len(timestamps) // 2]

    # Clip both windows so they never extend past the recorded trajectory.
    backward_start = max(midpoint_time - backward_time, timestamps[0])
    forward_end = min(midpoint_time + forward_time, timestamps[-1])

    backward_indices = [
        i for i, t in enumerate(timestamps) if backward_start <= t <= midpoint_time
    ]
    forward_indices = [
        i for i, t in enumerate(timestamps) if midpoint_time <= t <= forward_end
    ]

    # Select vertices by sample index. ``geometry.interpolate(i)`` must not be
    # used here: its argument is a distance along the line, not a vertex index,
    # so it returns points that are unrelated to the i-th sample.
    coords = list(row["geometry"].coords)
    if len(coords) != len(timestamps):
        raise ValueError(
            "geometry has %d vertices but there are %d timestamps; "
            "cannot match samples to vertices" % (len(coords), len(timestamps))
        )

    backward_row = row.copy()
    forward_row = row.copy()
    backward_row["geometry"] = LineString([coords[i] for i in backward_indices])
    forward_row["geometry"] = LineString([coords[i] for i in forward_indices])

    return backward_row, forward_row

def split_trajectories(df, backward_time=1800, forward_time=1800):
    """
    Apply backward/forward splitting to all rows in a DataFrame.

    Args:
        df: Input DataFrame with one trajectory per row.
        backward_time: Time (in seconds) for backward split.
        forward_time: Time (in seconds) for forward split.

    Returns:
        Two DataFrames: (backward trajectories, forward trajectories). For an
        empty input, two independent empty copies of ``df`` are returned.
    """
    # With no rows there is nothing to unzip below; return empty frames that
    # keep the input's columns instead of failing on the tuple unpacking.
    if df.empty:
        return df.copy(), df.copy()

    # Forward the window sizes explicitly; otherwise split_trajectory would
    # always run with its own defaults and the arguments given here would be
    # silently ignored.
    pairs = df.apply(
        split_trajectory,
        axis=1,
        backward_time=backward_time,
        forward_time=forward_time,
    )
    backward, forward = zip(*pairs)
    return pd.DataFrame(list(backward)), pd.DataFrame(list(forward))

# Step 3: Normalize Features
def normalize_features(df, columns=("latitude", "longitude", "speed", "course")):
    """
    Standardize the specified features (zero mean, unit standard deviation).

    Columns named in ``columns`` that are not present in ``df`` are skipped.
    The input DataFrame is not modified.

    Args:
        df: Input DataFrame.
        columns: Iterable of column names to normalize.

    Returns:
        A copy of ``df`` with the selected columns standardized. A column whose
        standard deviation is zero or undefined (constant values, or a single
        row) is only centered, so it becomes zeros rather than NaN.
    """
    normalized_df = df.copy()
    for col in columns:
        if col not in normalized_df:
            continue
        values = normalized_df[col]
        centered = values - values.mean()
        spread = values.std()
        # Dividing by a zero/NaN spread would turn the whole column into NaN
        # (or inf); leave such columns centered only.
        if pd.isna(spread) or spread == 0:
            normalized_df[col] = centered
        else:
            normalized_df[col] = centered / spread
    return normalized_df

# Step 4: Save Data
def save_preprocessed_data(df, filename):
    """
    Persist a preprocessed DataFrame by pickling it to disk.

    Args:
        df: The DataFrame to write out.
        filename: Destination path of the pickle file.
    """
    df.to_pickle(path=filename)

# Main Function for Data Preparation
def prepare_data(train_df, test_df, interval_minutes=5, backward_time=1800, forward_time=1800):
    """
    Run the full preparation pipeline: resample, split, normalize, save.

    The four resulting DataFrames are pickled into the current working
    directory as ``backward_train.pkl``, ``forward_train.pkl``,
    ``backward_test.pkl`` and ``forward_test.pkl``. Progress messages are
    printed before each stage.

    Note that every one of the four splits is normalized on its own, i.e. with
    the statistics of that split rather than shared training statistics.

    Args:
        train_df: Training dataset as a DataFrame.
        test_df: Test dataset as a DataFrame.
        interval_minutes: Resampling interval, in minutes.
        backward_time: Length of the backward window, in seconds.
        forward_time: Length of the forward window, in seconds.

    Returns:
        A 4-tuple ``(backward_train, forward_train, backward_test,
        forward_test)`` of normalized DataFrames.
    """
    # Stage 1: fixed-interval resampling of both datasets.
    print("Resampling trajectories...")
    train_resampled, test_resampled = resample_data(
        train_df, test_df, interval_minutes=interval_minutes
    )

    # Stage 2: backward/forward split, using the same windows for both sets.
    print("Splitting trajectories into backward and forward segments...")
    window = {"backward_time": backward_time, "forward_time": forward_time}
    train_back, train_fwd = split_trajectories(train_resampled, **window)
    test_back, test_fwd = split_trajectories(test_resampled, **window)

    # Stage 3: normalization, keyed by the file each result is written to.
    print("Normalizing features...")
    outputs = {
        "backward_train.pkl": normalize_features(train_back),
        "forward_train.pkl": normalize_features(train_fwd),
        "backward_test.pkl": normalize_features(test_back),
        "forward_test.pkl": normalize_features(test_fwd),
    }

    # Stage 4: write every result to its pickle file, in insertion order.
    print("Saving preprocessed data...")
    for target_file, frame in outputs.items():
        save_preprocessed_data(frame, target_file)

    print("Data preparation completed!")
    return tuple(outputs.values())

# Example usage:
if __name__ == "__main__":
    # Input pickles: the March 2024 file is used for training, the April 2024
    # file for testing.
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    train_path = "2024-03-01_2024-03-31_filtered.pkl"
    test_path = "2024-04-01_2024-04-30_filtered.pkl"
    train_df = pd.read_pickle(train_path)
    test_df = pd.read_pickle(test_path)

    # Run the pipeline with its default interval and window sizes.
    (
        backward_train,
        forward_train,
        backward_test,
        forward_test,
    ) = prepare_data(train_df, test_df)


Resampling trajectories...


  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)


Splitting trajectories into backward and forward segments...
Normalizing features...
Saving preprocessed data...
Data preparation completed!
