# Step 2b: Extract Near Miss Sequences (Fingerprint)

In [1]:
# Imports
import pandas as pd
import matplotlib
matplotlib.use('nbagg')
%matplotlib inline
from google.cloud import bigquery
import os

## GCP Setup:

In [4]:
# Google Cloud authentication:
# run the gcloud application-default login flow and stop the notebook if the
# command exits with a non-zero status.
auth_exit_status = os.system('gcloud auth application-default login --quiet')
assert auth_exit_status == 0

In [5]:
# GCP project ID that the gcloud CLI is pointed at in the next cell
# (`gcloud config set core/project ...`).
# Insert your project ID here:
PROJECT_ID = "ford-5bba11084fd31e17ec109f0c"

In [6]:
# Point the gcloud CLI at PROJECT_ID; abort if the command exits non-zero.
set_project_cmd = f"gcloud config set core/project {PROJECT_ID}"
assert os.system(set_project_cmd) == 0

In [7]:
# Print the active gcloud configuration so the selected project can be checked.
!gcloud config list

[core]
disable_usage_reporting = True
project = ford-5bba11084fd31e17ec109f0c



Your active configuration is: [default]


## Query:
- Get the driving characteristics of the top [X] most active BlueCruise VINs (those with the most BlueCruise data) that have a collision claim
- Between [DATES OF INTEREST]
- Initially looked at dates between 2023-01-01 and 2023-08-01 & the top 20 vins with the most data


In [8]:
# Query parameters. Edit these instead of the SQL text: each value is used in
# several places in the query and they must stay in sync.
BC_TABLE = "prj-dfad-31-usrda-p-31.dlobo1_bluecruise.gdpefa_adas_bc_ada_lm_vw"  # BlueCruise view; change this to your dataset
CLAIMS_TABLE = "ford-5bba11084fd31e17ec109f0c.GDIA_Credit.Management_Lease_All_Claims_Sep25"  # claims table joined on VIN
START_DATE = "2023-1-1"  # lower bound of the dates of interest (BETWEEN is inclusive)
END_DATE = "2023-8-1"    # upper bound of the dates of interest (BETWEEN is inclusive)
TOP_N_VINS = 20          # number of VINs of interest (VINs with the most rows in the date range)

# Selects trip id, local timestamp and the driving-characteristic signals from
# the BlueCruise view, restricted to:
#   * VINs that also appear in the claims table with CauseCode = 'Collision',
#   * rows whose event date falls between START_DATE and END_DATE,
#   * the TOP_N_VINS VINs with the most rows in that date range.
# With the default parameter values the rendered SQL is identical to the
# previously hard-coded query text.
# NOTE(review): the first `IN (...)` filter looks redundant, because the top-N
# subquery is already restricted to the same set of VINs - confirm before removing.
q = f"""
SELECT 
/* Identifiers */
dpefa_bc_007_trip_d as trip_id,

/*Time*/
dpefa_bc_007_event_local_m as date_time,

/* Driving Characteristics */
dpefa_bc_007_hst_veh_long_vlcy_r as veh_long_vel_mps,
dpefa_bc_007_hst_veh_ltrl_vlcy_r as veh_ltrl_vel_mps,
dpefa_bc_007_hst_veh_yaw_rate_r as veh_yaw_rate_radps,
dpefa_bc_007_acc_mps2 as veh_accel_mps2,
dpefa_bc_007_jerk_mps3 as veh_jerk_mps3,


FROM `{BC_TABLE}` /* Change this to your dataset */
/* Make sure BC vins are in claims table */
WHERE dpefa_bc_007_vin_17_x in (
  SELECT DISTINCT bc.dpefa_bc_007_vin_17_x
  FROM `{BC_TABLE}` as bc
  INNER JOIN `{CLAIMS_TABLE}`as claims
  ON bc.dpefa_bc_007_vin_17_x = claims.VIN
  WHERE bc.dpefa_bc_007_event_m BETWEEN '{START_DATE}' AND '{END_DATE}' /* SET DATES OF INTEREST */
  AND claims.CauseCode = 'Collision'
)
AND dpefa_bc_007_event_m BETWEEN '{START_DATE}' AND '{END_DATE}'
AND dpefa_bc_007_vin_17_x IN (SELECT
  dpefa_bc_007_vin_17_x AS vin,
  FROM `{BC_TABLE}`
  WHERE dpefa_bc_007_vin_17_x IN (
    SELECT DISTINCT bc.dpefa_bc_007_vin_17_x
    FROM `{BC_TABLE}` AS bc
    INNER JOIN `{CLAIMS_TABLE}` AS claims
    ON bc.dpefa_bc_007_vin_17_x = claims.VIN
    WHERE bc.dpefa_bc_007_event_m BETWEEN '{START_DATE}' AND '{END_DATE}' /* SET DATES OF INTEREST */
    AND claims.CauseCode = 'Collision'
  )
  AND dpefa_bc_007_event_m BETWEEN '{START_DATE}' AND '{END_DATE}' /* SET DATES OF INTEREST */
  GROUP BY dpefa_bc_007_vin_17_x
  ORDER BY COUNT(dpefa_bc_007_vin_17_x) DESC
  LIMIT {TOP_N_VINS}) /* SET NUMBER OF VINS OF INTEREST (Top X number of Vins with most BC data) */
"""

In [9]:
# Execute the query
client = bigquery.Client()
df = client.query(q)  # NOTE: despite its name this is the query job returned by client.query(), not a DataFrame
df_bc = df.to_dataframe()  # materialise the query result as a pandas DataFrame

## Data Formatting

In [None]:
# Data Cleaning:

# Parse the date_time column into pandas datetimes, expecting ISO 8601 formatted values.
df_bc['date_time'] = pd.to_datetime(df_bc['date_time'], format='ISO8601')

In [11]:
# Save the query result DataFrame to a CSV file.
# `df` is the job object returned by client.query() and has no to_csv();
# the materialised DataFrame is `df_bc`.
os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories
df_bc.to_csv('data/bc_claims_top20_jan_aug.csv', index = False)

## Impact Detection (Fingerprint Method)


### Find impacts based on velocity dropping to zero for a duration of X seconds
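- X = 280 seconds (4.67 minutes) [This can be tuned]
- Note: the duration is measured as the number of consecutive zero-velocity samples, so it equals seconds only when the data contains one sample per second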


In [30]:
def detect_impacts(df, zero_speed_duration_threshold):
    """
    Detects trips with potential impacts based on velocity dropping to zero.

    A trip is flagged when, after its rows are sorted by 'date_time', it
    contains a run of at least `zero_speed_duration_threshold` consecutive rows
    whose 'veh_long_vel_mps' is exactly 0. The run length is counted in rows,
    so it corresponds to seconds only if the data has one sample per second.

    Args:
    df: DataFrame containing trip data. Must have the columns 'trip_id',
        'date_time' and 'veh_long_vel_mps'.
    zero_speed_duration_threshold: The number of consecutive zero-speed rows
        (seconds, for one-sample-per-second data) required to consider it an impact.

    Returns:
    A list of trip numbers that have potential impacts, in ascending trip_id
    order. The list is empty when no trip qualifies or when df has no rows.
    """

    # Define a function to identify if there is a long enough stop in the trip
    def is_impact(trip_data):
        trip_data = trip_data.sort_values('date_time')
        zero_velocities = trip_data['veh_long_vel_mps'] == 0
        # A new run starts every time the zero / non-zero state flips.
        run_id = (zero_velocities != zero_velocities.shift()).cumsum()
        # Running count of zero-speed rows inside each run (stays 0 in moving runs).
        run_length = zero_velocities.astype(int).groupby(run_id).cumsum()
        return bool((run_length >= zero_speed_duration_threshold).any())

    # Only these columns are needed; iterating over the groups (instead of
    # groupby.apply on the full frame) avoids the deprecated behaviour of
    # apply operating on the grouping column and handles an empty frame.
    needed_columns = ['trip_id', 'date_time', 'veh_long_vel_mps']
    flagged_trips = [
        trip_id
        for trip_id, trip_data in df[needed_columns].groupby('trip_id')
        if is_impact(trip_data)
    ]

    # Index.tolist() converts NumPy scalars to plain Python values.
    return pd.Index(flagged_trips).tolist()

In [49]:
# Find the trip numbers with potential impacts
ZERO_SPEED_DURATION_THRESHOLD = 280  # required length of the zero-speed run [This can be tuned]
impacted_trips = detect_impacts(
    df_bc,
    zero_speed_duration_threshold=ZERO_SPEED_DURATION_THRESHOLD,
)
print(f"Trips with potential impacts: {impacted_trips}")

Trips with potential impacts: [2672, 2791, 3612, 5039, 5159]


  impacted_trips = df.groupby('trip_num').apply(is_impact)


### Label the impact indicator for the impacted trips
- The impact indicator is set to 1 for the point in time with the maximum absolute acceleration in the impact window
- impact_indicator = 0 for all other points in time
- impact_indicator is set to 0 for all trips that do not have an impact
- Impact window: from 1 minute before the vehicle stops up to the stop time, where the stop time is the first zero-velocity sample in the trip [This can be tuned]


In [40]:
# Create a new column called 'impact_indicator' that is initially set to 0 in the BlueCruise dataframe.
# Resetting it here also clears labels from a previous run of this cell.
df_bc['impact_indicator'] = 0


for trip_num in impacted_trips:
    trip_data = df_bc[df_bc['trip_id'] == trip_num]

    # Timestamps of every zero-velocity sample in the trip
    zero_speed_times = trip_data.loc[trip_data['veh_long_vel_mps'] == 0, 'date_time']
    if zero_speed_times.empty:
        continue

    # Earliest time when the vehicle velocity is zero.
    # NOTE(review): this is the first zero-velocity sample anywhere in the trip,
    # which is not necessarily the start of the long stop that flagged the
    # trip (e.g. a brief earlier stop) - confirm this is intended.
    stop_time = zero_speed_times.min()
    # Impact window: from one minute before the vehicle stops to the stop time (inclusive)
    start_time = stop_time - pd.Timedelta(minutes=1)
    in_window = (trip_data['date_time'] >= start_time) & (trip_data['date_time'] <= stop_time)

    # Absolute acceleration inside the window; rows without a value cannot be the maximum.
    window_abs_accel = trip_data.loc[in_window, 'veh_accel_mps2'].abs().dropna()
    if window_abs_accel.empty:
        # Nothing to label (empty window or no acceleration data); checking
        # before idxmax() avoids the error it raises on an empty selection.
        continue

    # Set the impact indicator to 1 for the row with the maximum absolute acceleration
    df_bc.loc[window_abs_accel.idxmax(), 'impact_indicator'] = 1

## Near Miss Extraction
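- Extract X-second near miss sequences before the impact time (each sequence ends 2 seconds before the impact time)
- X = 10 seconds [This can be tuned]
- Uncertainty window: 3 seconds before and after the impact time; one sequence is extracted for every whole-second shift in this range [This can be tuned]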



In [None]:
# Set window size variables:
X = 10 # Length in seconds of each extracted near-miss sequence (window size)
Y = 3 # Uncertainty in seconds about when the impact actually occurs; windows are shifted by each whole-second offset from -Y to +Y

In [None]:
# Extract the near-miss windows that precede each labelled impact.
#
# For every row with impact_indicator == 1, one window is cut per whole-second
# offset in [-Y, Y] (2*Y + 1 windows). Each window is the half-open interval
#   [impact_time - (X + 2) + offset, impact_time - 2 + offset)
# i.e. X seconds of data ending 2 seconds before the shifted impact time.
df_bc['seg_num_id'] = 0  # Initialize the segment number identifier
segment_count = 0  # Running segment counter; numbers are unique across all trips

# Collect the windows in a list and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
window_frames = []

for trip_num in impacted_trips:
    trip_data = df_bc[df_bc['trip_id'] == trip_num]
    impact_times = trip_data.loc[trip_data['impact_indicator'] == 1, 'date_time']

    # Process multiple windows around each impact
    for impact_time in impact_times:
        for offset in range(-Y, Y + 1):
            shift = pd.Timedelta(seconds=offset)
            window_start = impact_time - pd.Timedelta(seconds=X + 2) + shift
            window_end = impact_time - pd.Timedelta(seconds=2) + shift
            in_window = (trip_data['date_time'] >= window_start) & (trip_data['date_time'] < window_end)
            segment_count += 1  # a number is consumed even when the window has no rows
            # assign() returns a copy carrying this window's segment number
            window_frames.append(trip_data[in_window].assign(seg_num_id=segment_count))

if window_frames:
    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    windows_df = pd.concat(window_frames, ignore_index=True)
else:
    # No impacts were labelled: keep an empty frame that still has the expected
    # columns so later cells referencing 'seg_num_id' / 'impact_indicator' run.
    windows_df = df_bc.iloc[0:0].copy()

## Save the near miss sequences


In [None]:
# Data Cleaning:
MIN_SEGMENT_ROWS = 5  # minimum number of rows a segment must contain to be kept

# Drop any segment (rows sharing a seg_num_id) that has fewer than MIN_SEGMENT_ROWS rows.
# This is a row count; it equals seconds only for one-sample-per-second data.
windows_df = windows_df.groupby('seg_num_id').filter(lambda segment: len(segment) >= MIN_SEGMENT_ROWS)

# Drop the impact_indicator column, which is not part of the saved sequences.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
windows_df = windows_df.drop(columns=['impact_indicator'], errors='ignore')

In [None]:
# Save the cleaned near miss sequences to a CSV file
os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories
windows_df.to_csv('data/df_near_miss_windows_fingerprint.csv', index=False)