In [1]:
import os
import pandas as pd

def _load_additional_frame(file_path, file_name, excluded_parameters):
    """
    Loads one additional CSV file and prepares it for merging with a unified file.

    Parameters:
        file_path (str): Full path of the CSV file to read.
        file_name (str): Bare file name, used only in the log messages.
        excluded_parameters (list): Substrings; every column whose name contains one of them is dropped.

    Returns:
        pandas.DataFrame or None: The frame indexed by its converted 'Time' column, or None when
        the file has no usable 'Time' column.
    """
    additional_df = pd.read_csv(file_path, low_memory=False)

    if 'Time' not in additional_df.columns or additional_df['Time'].isna().all():
        print(f"'Time' column is missing or contains all NaN values in file: {file_name}")
        return None

    # Literal substring matching: an empty exclusion list keeps every column (an empty regex
    # would match all of them) and regex metacharacters in an entry are taken verbatim.
    keep_mask = [
        not any(excluded in str(column) for excluded in excluded_parameters)
        for column in additional_df.columns
    ]
    additional_df = additional_df.loc[:, keep_mask].copy()

    # The exclusion list itself may have removed 'Time'
    if 'Time' not in additional_df.columns:
        print(f"Additional file {file_name} does not contain a 'Time' column.")
        return None

    # Numeric 'Time' values in the additional files are interpreted as seconds since the epoch
    additional_df['Time'] = pd.to_datetime(additional_df['Time'], unit='s', errors='coerce')
    print(f"After conversion, {additional_df['Time'].isna().sum()} NaT values found in file: {file_name}")
    additional_df.set_index('Time', inplace=True)
    return additional_df


def expand_and_combine_with_unified_files(unified_files, corresponding_folders, excluded_parameters):
    """
    Expands each already existing unified CSV file with the columns of the additional CSV files found in its corresponding folder and extracts the data 5 seconds before each disengagement event, saving every event to a separate CSV file in that folder.

    For every (unified file, folder) pair the function:
      1. reads the unified file and indexes it by its 'Time' column,
      2. reads every other '.csv' file in the folder, drops the excluded columns and indexes it by 'Time',
      3. outer-joins the additional columns onto the unified data and forward/backward fills the gaps,
      4. writes '<unified name>_disengagement_<i>.csv' into the folder for the i-th row whose
         'disengagement' value equals 1, containing the rows of the 5 seconds up to and including it.

    Files written by step 4 in an earlier run are skipped in step 2, so the function can be re-run
    on the same folders. Pairs are skipped (with a printed message) when the folder does not exist,
    when the unified file has no usable 'Time' column, or when no 'disengagement' column is available.

    Parameters:
        unified_files (list): List of paths to unified CSV files to expand.
        corresponding_folders (list): List of paths to the folders containing additional CSV files for each unified file.
        excluded_parameters (list): List of column substrings to exclude from additional files. May be empty or None.

    Returns:
        None
    """
    excluded_parameters = list(excluded_parameters or [])

    # Walk through the unified files and their folders pairwise
    for unified_file, corresponding_folder_path in zip(unified_files, corresponding_folders):

        # Check if the corresponding folder exists
        if not os.path.isdir(corresponding_folder_path):
            print(f"Folder {corresponding_folder_path} not found. Skipping this file.")
            continue

        # Load the unified CSV file
        unified_df = pd.read_csv(unified_file)
        if 'Time' not in unified_df.columns or unified_df['Time'].isna().all():
            print(f"'Time' column is missing or contains all NaN values in file: {unified_file}")
            continue

        # Convert only if it is not already datetime64.
        # NOTE(review): no unit is given here, unlike for the additional files (unit='s');
        # presumably the unified 'Time' column holds datetime strings - confirm.
        if not pd.api.types.is_datetime64_any_dtype(unified_df['Time']):
            unified_df['Time'] = pd.to_datetime(unified_df['Time'], errors='coerce')
            print(unified_df['Time'].isna().sum(), "NaT values found after datetime conversion.")
        unified_df.set_index('Time', inplace=True)

        # Name stem shared by every output file of this unified file
        original_filename = os.path.splitext(os.path.basename(unified_file))[0]
        output_prefix = f"{original_filename}_disengagement_"

        # Collect additional files from the corresponding folder
        additional_dataframes = []
        for file_name in sorted(os.listdir(corresponding_folder_path)):
            file_path = os.path.join(corresponding_folder_path, file_name)

            if not file_name.endswith('.csv') or file_path == unified_file:
                continue
            # Results of a previous run live in the same folder; they are not input data
            if file_name.startswith(output_prefix):
                continue

            try:
                additional_df = _load_additional_frame(file_path, file_name, excluded_parameters)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole batch
                print(f"Error processing {file_name}: {e}")
                continue
            if additional_df is not None:
                additional_dataframes.append(additional_df)

        # Merge all additional DataFrames with the unified DataFrame
        if additional_dataframes:
            additional_combined_df = pd.concat(additional_dataframes, axis=0)  # Combine rows first
            additional_combined_df = additional_combined_df.sort_index()      # Ensure timestamps are in order

            # Collapse rows sharing a timestamp into one row holding the first non-null value per column
            additional_combined_df = additional_combined_df.groupby(level=0).first()

            # Combine with the unified DataFrame, keeping the timestamps of both sides
            unified_df = pd.concat([unified_df, additional_combined_df], axis=1, join='outer')
            print(f"NaT values in combined DataFrame: {unified_df.index.isna().sum()}")

        # Forward and backward fill missing values.
        # NOTE(review): this also fills 'disengagement' on timestamps that only exist in the
        # additional files, so rows right after a flagged row may carry the flag too - confirm
        # this is intended.
        unified_df.ffill(inplace=True)
        unified_df.bfill(inplace=True)

        # Reset index so 'Time' is a regular column again for filtering and saving
        unified_df.reset_index(inplace=True)

        if 'disengagement' not in unified_df.columns:
            print(f"Unified file {unified_file} does not contain a 'disengagement' column. Skipping.")
            continue

        # Export the 5 seconds leading up to (and including) every disengagement row
        disengagement_times = unified_df.loc[unified_df['disengagement'] == 1, 'Time']
        for i, disengagement_time in enumerate(disengagement_times, 1):
            time_range_start = disengagement_time - pd.Timedelta(seconds=5)
            in_window = (unified_df['Time'] >= time_range_start) & (unified_df['Time'] <= disengagement_time)
            disengagement_data = unified_df[in_window]

            output_file_path = os.path.join(corresponding_folder_path, f"{output_prefix}{i}.csv")
            disengagement_data.to_csv(output_file_path, index=False)
            print(f"Disengagement data saved: {output_file_path}")


# Recording names; each one identifies both a unified CSV file and its folder of additional CSVs
ride_names = [
    '2023-10-16-10-14-49_tiksoja_ride_02_sfa',
    '2023-10-16-10-30-17_tiksoja_ride_03_cluster',
    '2023-10-16-10-50-24_tiksoja_ride_04_sfa',
    '2023-10-16-13-47-19_tiksoja_ride_05_cluster',
    '2023-10-16-15-23-20_tiksoja_ride_08_cluster',
    '2023-10-30-09-55-20_tiksoja_ride_09_sfa_split_2',
    '2023-10-30-10-21-51_tiksoja_ride_10_cluster_split_1',
    '2023-10-30-10-34-58_tiksoja_ride_10_cluster_split_2',
    '2023-10-30-10-53-17_tiksoja_ride_11_cluster_split_1',
    '2023-10-30-11-07-11_tiksoja_ride_11_cluster_split_2',
    '2023-10-30-14-28-38_tiksoja_ride_12_sfa_split_1',
    '2023-10-30-15-04-36_tiksoja_ride_13_cluster_split_1',
    '2023-10-30-15-19-32_tiksoja_ride_13_cluster_split_2',
    '2023-10-31-09-57-58_tiksoja_ride_14_sfa_split_1',
    '2023-10-31-10-17-12_tiksoja_ride_14_sfa_split_2',
    '2023-10-31-10-41-16_tiksoja_ride_15_cluster_split_1',
    '2023-10-31-10-59-27_tiksoja_ride_15_cluster_split_2',
    '2023-11-02-11-47-11_tiksoja_ride_16_sfa_split_1',
    '2023-11-02-12-00-24_tiksoja_ride_16_sfa_split_2',
    '2023-11-02-12-44-53_tiksoja_ride_17_cluster_split_1',
    '2023-11-02-12-58-32_tiksoja_ride_17_cluster_split_2',
    '2023-11-03-09-57-03_tiksoja_ride_18_sfa_split_1',
    '2023-11-03-10-21-22_tiksoja_ride_18_sfa_split_2',
    '2023-11-03-10-59-56_tiksoja_ride_19_cluster_split_1',
    '2023-11-03-11-14-53_tiksoja_ride_19_cluster_split_2',
    '2023-11-03-13-42-01_tiksoja_ride_20_sfa_split_1',
    '2023-11-03-13-56-06_tiksoja_ride_20_sfa_split_2',
]

# Original unified files: 'F:\<ride>_with_disengagements.csv'
unified_files = [f'F:\\{ride_name}_with_disengagements.csv' for ride_name in ride_names]

# Corresponding folders with additional data: 'F:\Processed_CSVs\<ride>'
corresponding_folders = [f'F:\\Processed_CSVs\\{ride_name}' for ride_name in ride_names]

# Columns of the additional files whose name contains any of these substrings are removed
excluded_parameters = ['header', 'component_type', 'child_frame']

# Expand every unified file and write its per-disengagement extracts
expand_and_combine_with_unified_files(unified_files, corresponding_folders, excluded_parameters)


0 NaT values found after datetime conversion.
After conversion, 0 NaT values found in file: processed_detection-final_objects.csv
After conversion, 0 NaT values found in file: processed_planning-global_path.csv
After conversion, 0 NaT values found in file: processed_planning-local_path.csv
NaT values in combined DataFrame: 0
Duplicated timestamps: Index([], dtype='int64')
Duplicate rows:
Empty DataFrame
Columns: [Time, steer_cmd.steer, accel_cmd.accel, brake_cmd.brake, lamp_cmd.l, lamp_cmd.r, gear_cmd.gear, mode, twist_cmd.twist.linear.x, twist_cmd.twist.linear.y, twist_cmd.twist.linear.z, twist_cmd.twist.angular.x, twist_cmd.twist.angular.y, twist_cmd.twist.angular.z, ctrl_cmd.linear_velocity, ctrl_cmd.linear_acceleration, ctrl_cmd.steering_angle, emergency, pose.position.x, pose.position.y, pose.position.z, pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w, twist.linear.x, twist.linear.y, twist.linear.z, twist.angular.x, twist.angular.y, twist.angular.z, 