# Step 2b: Extract Near Miss Sequences (Event Timestamp)

In [1]:
# Imports
import pandas as pd
from datetime import timedelta
from google.cloud import bigquery
import os

## GCP Setup:

In [2]:
# Google Cloud authentication: start the application-default login flow.
# The command is run outside of `assert` so it still executes when Python is
# started with -O (which strips assert statements), and a non-zero exit status
# stops the notebook with an explicit error.
auth_status = os.system('gcloud auth application-default login --quiet')
if auth_status != 0:
    raise RuntimeError(
        f"'gcloud auth application-default login' failed with exit status {auth_status}"
    )

In [3]:
# Insert your project ID here (it is interpolated into the `gcloud config set core/project` command below):
PROJECT_ID = "ford-5bba11084fd31e17ec109f0c"

In [4]:
# Point the gcloud CLI at PROJECT_ID.
# Run the command outside of `assert` so it is not skipped under `python -O`,
# and fail loudly if gcloud reports a non-zero exit status.
config_status = os.system(f"gcloud config set core/project {PROJECT_ID}")
if config_status != 0:
    raise RuntimeError(
        f"'gcloud config set core/project' failed with exit status {config_status}"
    )

In [5]:
# Print the active gcloud configuration so the selected project can be checked by eye
!gcloud config list

[core]
disable_usage_reporting = True
project = ford-5bba11084fd31e17ec109f0c



Your active configuration is: [default]


## Query: BlueCruise Data from claims VINs:
- Gets all live data within ±1 day of the impact time for each VIN
- BlueCruise 1Hz Data

In [None]:
/* Reference only: the original BigQuery query that the 1 Hz dataset is based on. */
/* It returned some duplicate rows; these should be easy to fix / filter out. */
SELECT 
    /* Identifiers */
    b.dpefa_bc_007_vin_17_x AS vin,
    b.dpefa_bc_007_trip_d AS trip_id,

    /* Time */
    /* NOTE(review): the local event time is selected here, while the WHERE clause filters on
       dpefa_bc_007_event_m against MODEM_TIMESTAMP; confirm all three share the same time basis. */
    b.dpefa_bc_007_event_local_m AS date_time,

    /* Driving Characteristics */
    b.dpefa_bc_007_trffc_jam_asst_stat_x AS veh_assist_mode,
    b.dpefa_bc_007_hst_veh_long_vlcy_r AS veh_long_vel_mps,
    b.dpefa_bc_007_hst_veh_ltrl_vlcy_r AS veh_ltrl_vel_mps,
    b.dpefa_bc_007_hst_veh_yaw_rate_r AS veh_yaw_rate_radps,
    b.dpefa_bc_007_acc_mps2 AS veh_accel_mps2,
    b.dpefa_bc_007_jerk_mps3 AS veh_jerk_mps3

FROM `prj-dfad-31-usrda-p-31.dlobo1_bluecruise.gdpefa_adas_bc_ada_lm_vw` b

/* Joined on VIN only: a 1 Hz row that lies within a day of several impacts for the same VIN
   is returned once per impact (presumably the source of the duplicates mentioned above). */
JOIN `ford-5bba11084fd31e17ec109f0c.GATech.bc_impact_timestamps_2021_2024` i
ON b.dpefa_bc_007_vin_17_x = i.VIN

WHERE 
    /* Keep 1 Hz rows whose event time is within +/- 1 day of the impact's modem timestamp */
    b.dpefa_bc_007_event_m BETWEEN TIMESTAMP_SUB(i.MODEM_TIMESTAMP, INTERVAL 1 DAY) AND TIMESTAMP_ADD(i.MODEM_TIMESTAMP, INTERVAL 1 DAY)
    /* NOTE(review): this date-level range looks redundant with the timestamp range above;
       confirm whether it is kept on purpose. */
    AND DATE(b.dpefa_bc_007_event_m) BETWEEN DATE_SUB(DATE(i.MODEM_TIMESTAMP), INTERVAL 1 DAY) 
    AND DATE_ADD(DATE(i.MODEM_TIMESTAMP), INTERVAL 1 DAY)
    /* NOTE(review): a NULL event time cannot satisfy the BETWEEN conditions above,
       so these two checks appear redundant. */
    AND DATE(b.dpefa_bc_007_event_m) IS NOT NULL
    AND b.dpefa_bc_007_event_m IS NOT NULL

In [7]:
# The original query's result was filtered to remove duplicates and saved in the project;
# this query reads that saved table in full.
q = """
SELECT *
FROM `ford-5bba11084fd31e17ec109f0c.GATech.bluecruise_impacts_1hz_21-24`
"""

In [8]:
# Execute the query to get the 1Hz data
client = bigquery.Client()
# NOTE: despite its name, `df` holds the object returned by client.query(), not a DataFrame
df = client.query(q)
# Materialise the query result as a pandas DataFrame
df_bc = df.to_dataframe()

## Query: Timestamp of Impacts Dataset
- Only the VINs that are also in the BC dataset

In [None]:
/* Reference only: the impact-timestamp table is based on the two queries below. */
/* Query 1: keep impact-event rows whose VIN also appears in the BlueCruise view
   between 2021-01-01 and 2024-10-01. */
/* NOTE(review): the subquery returns one row per matching BlueCruise row (no DISTINCT), so the
   join can emit each impact row once per BlueCruise row of that VIN; confirm the saved result
   was de-duplicated. */
SELECT t1.*
FROM `ford-95bec5f0488dddd4a3227e1b.Ford_Pro.fias_management_impact_event_extract_09302024` AS t1
INNER JOIN (
    SELECT dpefa_bc_007_vin_17_x AS vin
    FROM `prj-dfad-31-usrda-p-31.dlobo1_bluecruise.gdpefa_adas_bc_ada_lm_vw`
    WHERE dpefa_bc_007_event_m BETWEEN '2021-01-01' AND '2024-10-01'
) AS t2 ON t1.VIN = t2.vin;

/* Query 2: the results above were saved to another BigQuery dataset, and this query was then
   run on it to keep only the VIN, timestamp and impact-event columns. */
SELECT VIN,
MODEM_TIMESTAMP,
TIME_OF_DAY, 
LOCAL_TIMESTAMP, 
RSTR_IMPACT_EVENT 
FROM `ford-5bba11084fd31e17ec109f0c.GATech.temp_impact_bc2`

In [10]:
# Read the full impact-timestamp table saved in the project
q2= """
SELECT *
FROM `ford-5bba11084fd31e17ec109f0c.GATech.bc_impact_timestamps_2021_2024`
"""

In [11]:
# Execute the query to get the impact timestamps
client = bigquery.Client()
# NOTE: despite its name, `df` holds the object returned by client.query(), not a DataFrame
df = client.query(q2)
# Materialise the query result as a pandas DataFrame
df_impact_timestamps = df.to_dataframe()

## Data Formatting

In [14]:
# Lower-case the column labels of both frames
for frame in (df_bc, df_impact_timestamps):
    frame.columns = [str.lower(label) for label in frame.columns]

# Parse the timestamp column of each frame into pandas datetimes using the ISO8601 format
for frame, column in ((df_bc, 'date_time'), (df_impact_timestamps, 'modem_timestamp')):
    frame[column] = pd.to_datetime(frame[column], format='ISO8601')

## Impact Detection (Using Event Timestamps Dataset)
- Define the tolerance: how close a 1Hz data timestamp must be to the event timestamp for the 1Hz data to be considered to have recorded the impact
- Set the tolerance

In [12]:
# Define tolerance (e.g. 1 minute) [NOTE: This is arbitrary, can be tuned]
# Maximum allowed gap between an impact timestamp and a 1Hz sample for the impact to count as found
tolerance = timedelta(minutes=1)

In [28]:
def check_impact_timestamps(one_hz_data, impact_data, tolerance):
    """Flag, per trip, which impact timestamps are covered by the trip's 1Hz data.

    Every trip in ``one_hz_data`` is paired with every impact recorded for the
    trip's VIN (the VIN of the trip's first row). An impact counts as found when
    at least one 1Hz sample of that trip lies within ``tolerance`` of it
    (inclusive).

    Parameters
    ----------
    one_hz_data : pandas.DataFrame
        Needs the columns 'trip_id', 'vin' and 'date_time' (datetime-like).
    impact_data : pandas.DataFrame
        Needs the columns 'vin' and 'modem_timestamp' (datetime-like).
    tolerance : datetime.timedelta or pandas.Timedelta
        Maximum absolute time difference for a match.

    Returns
    -------
    pandas.DataFrame
        One row per (trip, impact) pair with the columns 'trip_id', 'vin',
        'impact_timestamp' and 'found_in_1Hz_data'. The columns are present even
        when there are no pairs, so downstream column look-ups do not fail.

    Notes
    -----
    NOTE(review): 'date_time' and 'modem_timestamp' are compared directly;
    confirm that both are expressed on the same time basis (local vs. UTC).
    """
    result_columns = ['trip_id', 'vin', 'impact_timestamp', 'found_in_1Hz_data']

    # Split the impacts by VIN once instead of re-filtering the frame for every trip
    impacts_by_vin = {
        vin: vin_impacts['modem_timestamp']
        for vin, vin_impacts in impact_data.groupby('vin', sort=False)
    }

    results = []
    # sort=False keeps trips in order of first appearance
    for trip_id, trip_one_hz_data in one_hz_data.groupby('trip_id', sort=False):
        vin = trip_one_hz_data['vin'].iloc[0]
        trip_times = trip_one_hz_data['date_time']

        for impact_timestamp in impacts_by_vin.get(vin, ()):
            # Vectorised check: is any 1Hz sample within the tolerance of this impact?
            found = bool(((trip_times - impact_timestamp).abs() <= tolerance).any())
            results.append({
                'trip_id': trip_id,
                'vin': vin,
                'impact_timestamp': impact_timestamp,
                'found_in_1Hz_data': found
            })

    return pd.DataFrame(results, columns=result_columns)

In [29]:
# Check impact timestamps: one result row per (trip, impact) pair for the trip's VIN,
# flagged with whether any 1Hz sample of the trip lies within the tolerance of the impact
results_df = check_impact_timestamps(df_bc, df_impact_timestamps, tolerance)

In [30]:
# Keep only the result rows whose impact was found in the 1Hz data
found_mask = results_df['found_in_1Hz_data'].eq(True)
impact_detected_df = results_df.loc[found_mask]

## Near Miss Extraction
- Extract X-second near miss sequences before the impact time (each sequence ends 2 seconds before the offset impact time)
- X = 10 seconds [This can be tuned]
- Uncertainty window: 3 seconds before and after the impact time [This can be tuned]

In [None]:
# Set window size variables (both are used as numbers of seconds):
X = 10 # X-second sequence/ window size
Y = 3 # Y-second uncertainty window size (The uncertainty of when the impact actually occurs); windows are generated for every whole-second offset from -Y to +Y

In [None]:
# Build the near miss windows.
# For every detected impact, an X-second window is cut from the trip's 1Hz data
# once for each whole-second offset in [-Y, +Y]; each window ends GAP_SECONDS
# before the offset impact time. Every window gets its own seg_num_id.
GAP_SECONDS = 2  # seconds between the end of a window and the (offset) impact time

segment_count = 0  # running segment id, unique across all trips and impacts
window_frames = []  # windows are collected here and concatenated once at the end

for trip_num, trip_impacts in impact_detected_df.groupby('trip_id', sort=False):
    trip_data = df_bc[df_bc['trip_id'] == trip_num]

    # A trip can match more than one impact row, so each impact timestamp is
    # handled as a scalar (comparing 'date_time' against a whole Series of
    # timestamps fails or misaligns). Repeated timestamps are processed once.
    for impact_time in trip_impacts['impact_timestamp'].drop_duplicates():
        # Process multiple windows around each impact
        for offset in range(-Y, Y + 1):
            window_end = impact_time + pd.Timedelta(seconds=offset - GAP_SECONDS)
            window_start = window_end - pd.Timedelta(seconds=X)
            in_window = (trip_data['date_time'] >= window_start) & (trip_data['date_time'] < window_end)
            segment_count += 1
            # assign() returns a new frame, so df_bc itself is never modified
            window_frames.append(trip_data[in_window].assign(seg_num_id=segment_count))

if window_frames:
    # Single concat with a fresh 0..n-1 index
    windows_df = pd.concat(window_frames, ignore_index=True)
else:
    # No detected impacts: keep the expected columns so later cells still work
    windows_df = df_bc.iloc[0:0].assign(seg_num_id=0)

## Save the Near Miss Sequences

In [None]:
# Data Cleaning:
# 1) Drop any segment with fewer than MIN_SEGMENT_ROWS rows (i.e. less than
#    5 seconds of data at 1Hz), based on seg_num_id.
MIN_SEGMENT_ROWS = 5
segment_sizes = windows_df.groupby('seg_num_id')['seg_num_id'].transform('size')
windows_df = windows_df[segment_sizes >= MIN_SEGMENT_ROWS]

# 2) Drop the impact_indicator column when it is present. errors='ignore' keeps
#    this cell re-runnable and tolerant of inputs that never had the column.
windows_df = windows_df.drop(columns=['impact_indicator'], errors='ignore')

In [None]:
# Save the cleaned near miss sequences to a CSV file
os.makedirs('data', exist_ok=True)  # to_csv does not create a missing output directory
windows_df.to_csv('data/df_near_miss_windows_event_ts.csv', index=False)