In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Function to process the String to correct format - removing unwanted characters
def parse_objects_data_preprocessing(parsed_object, target):
    if parsed_object.startswith('['):
        parsed_object = parsed_object[1:]
    if (target == 'objects' and parsed_object.endswith(']]')) or (target == 'waypoints' and parsed_object.endswith(']')):
        parsed_object = parsed_object[:-1]
    return parsed_object.strip()

In [3]:
# Function to parse Strings to correct types
def parse_value(value):
    value = value.strip()
    if re.match(r'^\d+$', value):  # Integer
        return int(value)
    elif re.match(r'^\d+\.\d+$', value):  # Float
        return float(value)
    elif value in ['True', 'False']:  # Boolean
        return value == 'True'
    return value.strip('"')  # String

In [4]:
# Function to recursively parse the lines of tab-separated key-value pairs into a nested dictionary
def recursive_parse(lines, level=0):
    obj = {}
    
    while lines:
        line = lines.pop(0)
        tabs = len(re.match(r'^\t*', line).group())
        
        # If the indentation level decreases, stop processing
        if tabs < level:
            lines.insert(0, line)
            break
        
        parts = line.strip().split(":", 1)
        key = parts[0].strip()
        value = parts[1].strip() if len(parts) > 1 else ""
        
        if not value:  # If value is empty, treat as nested object
            obj[key] = recursive_parse(lines, level=tabs + 1)
        else:  # Parse key-value pair
            parsed_value = parse_value(value)
            obj[key] = parsed_value
    
    return obj

In [5]:
# Function to parse the String value into separate objects
def parse_objects_column(objects_data):
    # Values are separated by commas
    objects_raw = objects_data.split(",")
    parsed_objects = []

    for obj_data in objects_raw:
        # Formatting data by replacing spaces with tabs
        formatted_data = re.sub(r' {4}', '\t\t', obj_data.strip())
        formatted_data = re.sub(r' {2}', '\t', formatted_data)
        lines = formatted_data.splitlines()
        parsed_object = recursive_parse(lines)
        parsed_objects.append(parsed_object)
    
    return parsed_objects

In [6]:

def process_csv(file_path, target, desired_features, output_path):
        
        df = pd.read_csv(file_path)

        expanded_rows = []

        for _, row in df.iterrows():
            # Preprocess the target column (either 'objects' or 'waypoints')
            target_raw = parse_objects_data_preprocessing(row[target], target)

            try:
                # Parse the nested data
                parsed_objects = parse_objects_column(target_raw)

                if not isinstance(parsed_objects, list):
                    parsed_objects = [parsed_objects]

                # Create a new row for the parsed objects
                new_row = row.drop(labels=[target]).to_dict()

                # Finding the desired features and creating new columns for each object, based on the index
                for i, obj in enumerate(parsed_objects):
                    for feature, json_path in desired_features.items():
                        keys = json_path.split('.')
                        value = obj
                        for key in keys:
                            value = value.get(key, None) if isinstance(value, dict) else None
                        new_row[f"{feature}_{i+1}"] = value

                expanded_rows.append(new_row)

            except Exception as e:
                print(f"Error parsing row in {file_path}: {e}")
                continue

        # Create a new DataFrame
        expanded_df = pd.DataFrame(expanded_rows)

        # Add missing columns for uniformity (if there are fewer objects than the maximum)
        max_objects = max(
            sum(1 for col in expanded_df.columns if col.startswith(feature))
            for feature in desired_features.keys()
        )
        for feature in desired_features.keys():
            for i in range(1, max_objects + 1):
                column_name = f"{feature}_{i}"
                if column_name not in expanded_df.columns:
                    expanded_df[column_name] = np.nan

        # Save to output file
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        expanded_df.to_csv(output_path, index=False)
        print(f"Processed CSV saved to {output_path}")

In [7]:
# Defining the target column for each file type
targets = {
    'detection-final_objects.csv': 'objects',
    'planning-global_path.csv': 'waypoints',
    'planning-local_path.csv': 'waypoints'
}

In [8]:
# Defining the desired features for each target type
desired_features_mapping = {
    'objects': {
        'label': 'label',
        'position_x': 'pose.position.x',
        'position_y': 'pose.position.y',
        'position_z': 'pose.position.z',
        'orientation_x': 'pose.orientation.x',
        'orientation_y': 'pose.orientation.y',
        'orientation_z': 'pose.orientation.z',
        'orientation_w': 'pose.orientation.w',
        'dimensions_x': 'dimensions.x',
        'dimensions_y': 'dimensions.y',
        'dimensions_z': 'dimensions.z',
    },
    'waypoints': {
        'position_x': 'pose.pose.position.x',
        'position_y': 'pose.pose.position.y',
        'position_z': 'pose.pose.position.z',
        'orientation_x': 'pose.pose.orientation.x',
        'orientation_y': 'pose.pose.orientation.y',
        'orientation_z': 'pose.pose.orientation.z',
        'orientation_w': 'pose.pose.orientation.w',
        'twist_linear_x': 'twist.twist.linear.x',
        'twist_linear_y': 'twist.twist.linear.y',
        'twist_linear_z': 'twist.twist.linear.z',
        'twist_angular_x': 'twist.twist.angular.x',
        'twist_angular_y': 'twist.twist.angular.y',
        'twist_angular_z': 'twist.twist.angular.z',
        'dtlane_lw': 'dtlane.lw',
        'dtlane_rw': 'dtlane.rw'
    }
}

In [27]:
# Defining folders we want to process
input_folders = [
    'F:\\2023-10-16-10-14-49_tiksoja_ride_02_sfa',
    'F:\\2023-10-16-10-30-17_tiksoja_ride_03_cluster',
    'F:\\2023-10-16-10-50-24_tiksoja_ride_04_sfa',
    'F:\\2023-10-16-13-47-19_tiksoja_ride_05_cluster',
    'F:\\2023-10-16-15-23-20_tiksoja_ride_08_cluster',
    'F:\\2023-10-30-09-55-20_tiksoja_ride_09_sfa_split_2',
    'F:\\2023-10-30-10-21-51_tiksoja_ride_10_cluster_split_1',
    'F:\\2023-10-30-10-34-58_tiksoja_ride_10_cluster_split_2',
    'F:\\2023-10-30-10-53-17_tiksoja_ride_11_cluster_split_1',
    'F:\\2023-10-30-11-07-11_tiksoja_ride_11_cluster_split_2',
    'F:\\2023-10-30-14-28-38_tiksoja_ride_12_sfa_split_1',
    'F:\\2023-10-30-15-04-36_tiksoja_ride_13_cluster_split_1',
    'F:\\2023-10-30-15-19-32_tiksoja_ride_13_cluster_split_2',
    'F:\\2023-10-31-09-57-58_tiksoja_ride_14_sfa_split_1',
    'F:\\2023-10-31-10-17-12_tiksoja_ride_14_sfa_split_2',
    'F:\\2023-10-31-10-41-16_tiksoja_ride_15_cluster_split_1',
    'F:\\2023-10-31-10-59-27_tiksoja_ride_15_cluster_split_2',
    'F:\\2023-11-02-11-47-11_tiksoja_ride_16_sfa_split_1',
    'F:\\2023-11-02-12-00-24_tiksoja_ride_16_sfa_split_2',
    'F:\\2023-11-02-12-44-53_tiksoja_ride_17_cluster_split_1',
    'F:\\2023-11-02-12-58-32_tiksoja_ride_17_cluster_split_2',
    'F:\\2023-11-03-09-57-03_tiksoja_ride_18_sfa_split_1',
    'F:\\2023-11-03-10-21-22_tiksoja_ride_18_sfa_split_2',
    'F:\\2023-11-03-10-59-56_tiksoja_ride_19_cluster_split_1',
    'F:\\2023-11-03-11-14-53_tiksoja_ride_19_cluster_split_2',
    'F:\\2023-11-03-13-42-01_tiksoja_ride_20_sfa_split_1',
    'F:\\2023-11-03-13-56-06_tiksoja_ride_20_sfa_split_2'
]

In [19]:
# Defingin the files we want to process
file_list = [
    'detection-final_objects.csv',
    'planning-global_path.csv',
    'planning-local_path.csv'
]

In [20]:
# Destination folder for processed CSVs
output_folder = "F:\\Processed_CSVs"

In [35]:
# Iterating over the folders and files and creating processed CSVs
for folder in input_folders:
    for file_name in file_list:
        file_path = os.path.join(folder, file_name)
        if os.path.exists(file_path):
            # The target based on the file name
            target = targets[file_name]
            # Desired features based on the target
            desired_features = desired_features_mapping[target]
            output_path = os.path.join(output_folder, os.path.basename(folder), f"processed_{file_name}")
            process_csv(file_path, target, desired_features, output_path)
        else:
            print(f"File not found: {file_path}")

F:\2023-10-16-10-14-49_tiksoja_ride_02_sfa\detection-final_objects.csv
Processed CSV saved to F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_detection-final_objects.csv
F:\2023-10-16-10-14-49_tiksoja_ride_02_sfa\planning-global_path.csv
Processed CSV saved to F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-global_path.csv
F:\2023-10-16-10-14-49_tiksoja_ride_02_sfa\planning-local_path.csv
Processed CSV saved to F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-local_path.csv


In [35]:
import os
import pandas as pd
from collections import defaultdict

# Define the root directory containing the folders
root_directory = "F:\\Processed_CSVs"

# Initialize a dictionary to store the column count overview
column_count_overview = {
    "File 1": defaultdict(int),
    "File 2": defaultdict(int),
    "File 3": defaultdict(int),
}

# Iterate over all folders in the root directory
for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # Iterate over all files in the folder (assume fixed order File 1, File 2, File 3)
    for i, file_name in enumerate(sorted(os.listdir(folder_path))):
        file_path = os.path.join(folder_path, file_name)

        try:
            # Read the CSV file
            df = pd.read_csv(file_path, low_memory=False)

            # Get the last column name and extract the number from the format {feature}_{i}
            last_column = df.columns[-1]
            try:
                column_index = int(last_column.split("_")[-1])
                column_count_overview[f"File {i + 1}"][column_index] += 1
            except ValueError:
                print(f"Invalid format for the last column in {file_path}: {last_column}")

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")


File 1: {27: 1, 192: 1, 13: 1, 201: 1, 196: 1, 33: 1, 208: 2, 207: 1, 199: 2, 43: 1, 179: 1, 200: 1, 28: 1, 49: 1, 197: 1, 330: 1, 34: 1, 44: 1, 206: 1, 202: 1, 42: 2, 45: 1, 190: 1, 24: 1}
File 2: {670: 3, 585: 2, 645: 1, 315: 1, 817: 1, 765: 1, 686: 1, 661: 1, 763: 3, 909: 1, 1082: 1, 1191: 1, 925: 1, 1085: 1, 646: 1, 667: 1, 741: 2, 1228: 1, 1309: 1, 908: 1, 635: 1}
File 3: {100: 27}


In [36]:
# Print the summary overview with the minimum key
for file_type, counts in column_count_overview.items():
    counts_dict = dict(counts)
    min_key = min(counts_dict.keys()) if counts_dict else None
    print(f"{file_type}: {counts_dict}, Minimum key: {min_key}")


File 1: {27: 1, 192: 1, 13: 1, 201: 1, 196: 1, 33: 1, 208: 2, 207: 1, 199: 2, 43: 1, 179: 1, 200: 1, 28: 1, 49: 1, 197: 1, 330: 1, 34: 1, 44: 1, 206: 1, 202: 1, 42: 2, 45: 1, 190: 1, 24: 1}, Minimum key: 13
File 2: {670: 3, 585: 2, 645: 1, 315: 1, 817: 1, 765: 1, 686: 1, 661: 1, 763: 3, 909: 1, 1082: 1, 1191: 1, 925: 1, 1085: 1, 646: 1, 667: 1, 741: 2, 1228: 1, 1309: 1, 908: 1, 635: 1}, Minimum key: 315
File 3: {100: 27}, Minimum key: 100


In [3]:
import os
import pandas as pd
from collections import defaultdict

# Define the input folder
processed_folder = "F:\\Processed_CSVs"

# Initialize a dictionary to store results
filled_columns_overview = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Dictionary to track overall minimums for each file type
overall_minimums = defaultdict(lambda: float('inf'))

# Process each folder
for folder in os.listdir(processed_folder):
    folder_path = os.path.join(processed_folder, folder)
    if os.path.isdir(folder_path):
        for idx, file_name in enumerate(sorted(os.listdir(folder_path))):  # Ensure consistent file order
            file_path = os.path.join(folder_path, file_name)
            if file_name.endswith(".csv"):
                try:
                    # Read the file
                    df = pd.read_csv(file_path, low_memory=False)

                    # Initialize a dictionary for filled column counts in this file
                    filled_columns_count = defaultdict(int)

                    for _, row in df.iterrows():
                        # Find the last non-NaN value's column
                        last_filled = row.last_valid_index()

                        if last_filled:
                            # Extract the last column index {feature}_{i}
                            if "_" in last_filled:
                                i = last_filled.split("_")[-1]  # Get the last part after the last underscore
                                if i.isdigit():  # Ensure it's a number
                                    filled_columns_count[int(i)] += 1
                                #else:
                                    #filled_columns_count[0] += 1  # Add key 0 for rows with no valid numbers
                            #else:
                                #filled_columns_count[0] += 1  # Add key 0 for rows with no underscore
                        #else:
                            #filled_columns_count[0] += 1  # Add key 0 for completely empty rows

                    # Save results for the current file
                    filled_columns_overview[folder][f"File {idx + 1}"] = dict(filled_columns_count)

                    # Update overall minimum for this file type
                    if filled_columns_count:
                        file_min = min(filled_columns_count.keys())
                        overall_minimums[f"File {idx + 1}"] = min(overall_minimums[f"File {idx + 1}"], file_min)

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

# Print results
for folder, files in filled_columns_overview.items():
    print(f"Folder: {folder}")
    for file, counts in files.items():
        file_min = min(counts.keys()) if counts else None
        print(f"  {file}: {counts} (Minimum: {file_min})")

print("\nOverall Minimums Across All Folders:")
for file_type, min_value in overall_minimums.items():
    print(f"  {file_type}: {min_value}")


Folder: 2023-10-16-10-14-49_tiksoja_ride_02_sfa
  File 1: {9: 128, 11: 102, 12: 79, 10: 137, 13: 76, 15: 76, 16: 91, 14: 80, 17: 141, 18: 156, 20: 85, 21: 70, 19: 129, 24: 34, 26: 13, 25: 19, 23: 25, 22: 37, 27: 9, 7: 135, 8: 119, 6: 229, 4: 400, 5: 319, 3: 650, 2: 822, 1: 1345} (Minimum: 1)
  File 2: {77: 1, 670: 1} (Minimum: 77)
  File 3: {99: 4, 98: 2, 97: 2, 96: 4, 95: 2, 94: 3, 93: 3, 92: 2, 91: 3, 90: 4, 89: 2, 88: 3, 87: 4, 86: 3, 85: 4, 84: 4, 83: 4, 82: 4, 81: 5, 80: 5, 79: 6, 78: 6, 77: 7, 76: 9, 75: 10, 74: 69, 73: 9, 72: 5, 71: 6, 70: 4, 69: 3, 68: 5, 67: 3, 66: 3, 65: 4, 64: 4, 63: 3, 62: 3, 61: 4, 60: 3, 59: 4, 58: 4, 57: 3, 56: 3, 55: 4, 54: 4, 53: 3, 52: 5, 51: 4, 50: 3, 49: 5, 48: 4, 47: 4, 46: 5, 45: 4, 44: 4, 43: 5, 42: 4, 41: 5, 40: 3, 39: 4, 38: 5, 37: 5, 36: 4, 35: 5, 34: 4, 33: 4, 32: 5, 31: 4, 30: 5, 29: 4, 28: 5, 27: 5, 26: 5, 25: 4, 24: 5, 23: 5, 22: 5, 21: 5, 20: 5, 19: 5, 18: 6, 17: 4, 16: 5, 15: 6, 14: 4, 13: 6, 12: 6, 11: 6, 10: 7, 9: 7, 8: 9, 7: 11, 6: 16

In [None]:
# Folder where the processed CSVs are stored
processed_folder = "F:\\Processed_CSVs"

# Helper function to rename columns matching the pattern {feature}_{i}
def rename_columns(df, prefix):
    def rename_column(col):
        parts = col.split('_')  # Split column name by underscores
        if parts[-1].isdigit():  # Check if the last part is a digit
            return f"{prefix}_{col}"  # Rename column with the prefix
        return col

    return df.rename(columns=rename_column)

# Helper function to 'throw out' rows where all columns starting with a prefix are NaN - meaning if there are no detected objects or waypoints for a timestamp, we can remove that row
def filter_by_last_filled(df, prefix):
    # Identify all columns that start with the prefix and end with a digit
    pattern_columns = [col for col in df.columns if col.startswith(prefix) and col.split('_')[-1].isdigit()]

    if not pattern_columns:
        raise ValueError(f"No columns starting with '{prefix}' and ending with a digit found.")

    def has_valid_column(row):
        # Check if any of the identified columns are filled (not NaN) in this row
        return row[pattern_columns].notna().any()

    # Use the filter function to retain rows where at least one relevant column is filled
    valid_mask = df.apply(has_valid_column, axis=1)
    return df[valid_mask]


# Helper function to truncate columns to a specific limit
def truncate_columns(df, prefix, limit):
    def should_keep_column(col):
        # Check if the column starts with the prefix and ends with a digit
        if col.startswith(prefix) and col.split("_")[-1].isdigit():
            # Keep the column if the last digit is less than or equal to the limit
            return int(col.split("_")[-1]) <= limit
        # Keep columns that do not end with a digit
        return True

    # Filter columns based on the `should_keep_column` function
    columns_to_keep = [col for col in df.columns if should_keep_column(col)]
    return df[columns_to_keep]


# Initialize a global mapping for all labels
global_label_mapping = {}

def categorize_labels(df):
    global global_label_mapping  # Access the global mapping
    
    for col in df.columns:
        # Check if "label" appears between underscores in the column name
        if re.search(r"_label_", col):
            # Extract unique values from the column
            unique_values = df[col].dropna().unique()
            
            # Add new labels to the global mapping if not already present
            for value in unique_values:
                if value not in global_label_mapping:
                    global_label_mapping[value] = len(global_label_mapping)

            # Map the column values to consistent codes using the global mapping
            df[col] = df[col].map(global_label_mapping).fillna(-1).astype(int)
    
    return df



# Main processing logic
# Iterate over all folders in the processed folder and all csv files within each folder
for folder_name in os.listdir(processed_folder):
    folder_path = os.path.join(processed_folder, folder_name)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if not file_name.endswith(".csv"):
                continue

            try:
                # Read the CSV file
                df = pd.read_csv(file_path, low_memory=False)

                if file_name == "processed_detection-final_objects.csv":
                    df = rename_columns(df, "objects") # Rename the columns with the prefix "objects"
                    df = filter_by_last_filled(df, "objects") # Filter out rows where all object columns are NaN
                    df = truncate_columns(df, "objects", 10) # Truncate the object columns to a maximum of 10
                    df = categorize_labels(df) # Categorize the labels and map them to consistent codes
                    df.fillna(-999, inplace=True) # Fill NaN values with a specific value
                
                elif file_name == "processed_planning-global_path.csv":
                    df = rename_columns(df, "global") # Rename the columns with the prefix "global"
                    df = filter_by_last_filled(df, "global") # Filter out rows where all global columns are NaN
                    df = truncate_columns(df, "global", 10) # Truncate the global columns to a maximum of 10
                    df.fillna(-999, inplace=True) # Fill NaN values with a specific value

                elif file_name == "processed_planning-local_path.csv":
                    df = rename_columns(df, "local") # Rename the columns with the prefix "local"
                    df = filter_by_last_filled(df, "local") # Filter out rows where all local columns are NaN
                    df = truncate_columns(df, "local", 10) # Truncate the local columns to a maximum of 10
                    df.fillna(-999, inplace=True) # Fill NaN values with a specific value
                
                # Save the processed file back to the same location (or modify as needed)
                df.to_csv(file_path, index=False)
                print(f"Processed file: {file_path}")

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")


F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_detection-final_objects.csv
Processed file: F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_detection-final_objects.csv
F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-global_path.csv
Processed file: F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-global_path.csv
F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-local_path.csv
Processed file: F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\processed_planning-local_path.csv
F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\2023-10-16-10-14-49_tiksoja_ride_02_sfa_with_disengagements_disengagement_1.csv
Processed file: F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\2023-10-16-10-14-49_tiksoja_ride_02_sfa_with_disengagements_disengagement_1.csv
F:\Processed_CSVs\2023-10-16-10-14-49_tiksoja_ride_02_sfa\2023-10-16-10-14-49_tiksoja_ride_02_sfa_

In [37]:
print("Global Category Mapping:", global_label_mapping)


Global Category Mapping: {'car': 0, 'truck': 1, 'pedestrian': 2, 'bus': 3, 'bicyclist': 4, 'on_road_obstacle': 5, 'trailer': 6, 'motorcyclist': 7, 'traffic_cone': 8, 'animal': 9, 'stroller': 10, 'unknown': 11}
