# Step 4: Run Near Miss Algorithm

In [1]:
# Imports
import numpy as np
import pandas as pd
import time
import pickle

from joblib import Parallel, delayed
import near_miss_algorithms.near_miss_v3 as nma

## Load Datasets

### Complete Trips Dataset (BlueCruise [From step 1])

In [None]:
# Load the complete BlueCruise trips table (output of step 1).
# NOTE(review): the file name contains the literal "PLACEHOLDER" -- presumably it
# has to be replaced with the real step-1 output name before running; confirm.
df_bc = pd.read_csv('data/df_bc_trips_PLACEHOLDER.csv')

### Near Miss Sequences Dataset (From step 3)


In [None]:
# Load the near-miss sequence windows (output of step 3). Later cells read the
# 'seg_num_id' and 'trip_num' columns of this table.
windows_df = pd.read_csv('data/df_near_miss_windows_combined1.csv')

## Data Preprocessing

In [None]:
## Trip Duration Dataframe

# Parse the date_time column of df_bc into pandas datetimes, overwriting the
# original column in place.
# NOTE(review): the format='ISO8601' keyword is, as far as I know, only accepted
# by pandas >= 2.0 -- confirm the environment's pandas version.
df_bc['date_time'] = pd.to_datetime(df_bc['date_time'], format='ISO8601')

## Sequence Identification Functions
- Utilize the developed Near Miss Algorithm (keep the Python code in the near_miss_algorithms folder)
- Near Miss Algorithm Repo: https://github.com/DonavenLobo/FordxGatech_NearMissAlgo_Dev


In [None]:
# Function to calculate the combined distance profile
def compute_combined_distance(query_df, trip_df, columns):
    """Sum the per-column near-miss distance profiles of a query against a trip.

    For every name in ``columns``, ``nma.near_miss`` is called with the trip's
    values for that column followed by the query's values, and the resulting
    profiles are added together.

    Args:
        query_df: DataFrame holding the query sequence; must contain ``columns``.
        trip_df: DataFrame holding the trip to search; must contain ``columns``.
        columns: Non-empty sequence of column names to compare.

    Returns:
        The accumulated distance profile: the object returned by
        ``nma.near_miss`` for the first column, with the profiles of the
        remaining columns added to it in place.
    """
    def profile_for(name):
        # Trip series first, query series second.
        return nma.near_miss(trip_df[name].values, query_df[name].values)

    anchor = columns[0]
    total = profile_for(anchor)

    # The anchor column already seeds the running total, so every entry equal
    # to it is skipped while accumulating the rest.
    for name in columns:
        if name != anchor:
            total += profile_for(name)

    return total

In [None]:
# Function to compute MASS for a single trip-query pair
def compute_mass_for_trip(query_df, trip_df, query_num, trip_num, time_series_columns):
    """Compute the combined distance profile of one query against one trip.

    Args:
        query_df: Rows of the query sequence.
        trip_df: Rows of the candidate trip.
        query_num: Identifier of the query; copied into the result.
        trip_num: Identifier of the trip; copied into the result.
        time_series_columns: Column names forwarded to
            ``compute_combined_distance``.

    Returns:
        ``None`` when the pair is skipped: either the trip has fewer rows than
        the query, or no distance value remains before the first NaN of the
        profile. Otherwise a dict with the keys ``trip_num``, ``query_num``,
        ``distance_profile`` (the profile truncated at its first NaN) and
        ``min_distance`` (the minimum of that truncated profile).
    """
    # A query cannot be matched inside a trip that has fewer rows than it does.
    if len(trip_df) < len(query_df):
        return None

    profile = compute_combined_distance(query_df, trip_df, time_series_columns)

    # Keep only the leading run of valid distances: everything from the first
    # NaN onwards is dropped (not just trailing NaNs).
    nan_mask = np.isnan(profile)
    if nan_mask.any():
        profile = profile[:np.nonzero(nan_mask)[0][0]]

    # np.min raises ValueError on an empty array, which would abort the whole
    # parallel run. A pair with no usable distances is reported as skipped;
    # the caller already filters out None results.
    if len(profile) == 0:
        return None

    return {
        'trip_num': trip_num,
        'query_num': query_num,
        'distance_profile': profile,
        'min_distance': np.min(profile)
    }

In [None]:
# Optimized function to get distance profiles for all trips in parallel
def get_distance_profiles_parallel(windows_df, df_bc, time_series_columns, n_jobs=-1):
    """Compare every near-miss query against every other trip, in parallel.

    Each distinct ``seg_num_id`` in ``windows_df`` is one query. A query is
    evaluated with ``compute_mass_for_trip`` against every trip in ``df_bc``
    except the trip the query itself was taken from.

    Args:
        windows_df: Query windows; must contain ``seg_num_id`` and ``trip_num``.
        df_bc: All trips; must contain ``trip_num``.
        time_series_columns: Column names forwarded to ``compute_mass_for_trip``.
        n_jobs: Number of joblib workers (``-1`` by default).

    Returns:
        list[dict]: the non-``None`` results of ``compute_mass_for_trip``,
        ordered by query and, within a query, by first appearance of the trip
        in ``df_bc``.
    """
    # Split df_bc into per-trip frames once. Filtering the full frame again for
    # every (query, trip) pair would repeat the same scan queries x trips times.
    # sort=False keeps the trips in order of first appearance.
    trip_frames = list(df_bc.groupby('trip_num', sort=False))

    results = []

    # Using Parallel as a context manager keeps one worker pool alive across
    # all queries instead of creating a new pool for each query.
    with Parallel(n_jobs=n_jobs) as parallel:
        for query_num in windows_df['seg_num_id'].unique():
            query_df = windows_df[windows_df['seg_num_id'] == query_num].copy()
            query_trip_num = query_df['trip_num'].values[0]

            trip_results = parallel(
                delayed(compute_mass_for_trip)(query_df, trip_df, query_num, trip_num, time_series_columns)
                for trip_num, trip_df in trip_frames
                if trip_num != query_trip_num  # Skip the query's own trip
            )

            # None marks a skipped (query, trip) pair.
            results.extend(result for result in trip_results if result is not None)

    return results

## Run Near Miss Algorithm (nma)
- Input: windows_df, df_bc, time_series_columns
- Output: distance_profiles_dict


In [None]:
# Columns compared between each query and each trip. One distance profile is
# computed per column and the per-column profiles are summed, so every name
# listed here must exist in both windows_df and df_bc.
time_series_columns = ['veh_long_vel_mps', 'veh_accel_mps2', 'veh_ltrl_vel_mps', 'veh_yaw_rate_radps', 'veh_jerk_mps3']

In [None]:
# Time the full query-vs-trip comparison. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be skewed by system
# clock adjustments while this long-running cell executes.
start_time = time.perf_counter()

distance_profiles = get_distance_profiles_parallel(windows_df, df_bc, time_series_columns)

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

Execution time: 5303.12 seconds


## Save the output dictionary
- The saved object is a list of dictionaries, one per (query, trip) pair. Fields of each dictionary: trip_num (int), query_num (int), distance_profile (np array), min_distance (float)


In [None]:
# Persist the distance profiles (the list of per-(query, trip) result dicts
# returned by get_distance_profiles_parallel) with pickle. Rename the output
# file as appropriate.
# Unpickling can execute arbitrary code, so only load this file from a trusted source.
with open('data/nma_distance_profiles_dict.pkl', 'wb') as f:
    pickle.dump(distance_profiles, f)