In [8]:
#import libraries
import os
import pandas as pd
import numpy as np
from Data_Prep import *

In [9]:
# Directory holding the CAN trace datasets.
# Set the CAN_DATA_DIR environment variable to point the notebook at another
# location; when it is unset, the original local path below is used.
in_can = os.environ.get(
    "CAN_DATA_DIR", "/media/arupreza/Assets/Data/Dataset_Yoonji/CAN/"
)

In [10]:
# List all entries in the directory and return them as a sorted list
def get_file_list(directory):
    """
    Print and return the sorted entry names found in a directory.

    The names come straight from ``os.listdir`` (so they are bare names, not
    full paths) and are sorted so that the printed 1-based index of each entry
    is stable between runs.

    Args:
        directory (str): Path of the directory to list.

    Returns:
        list: Sorted entry names, or an empty list when the directory does not
        exist, the path is not a directory, or access is denied. In those
        cases a message is printed instead of raising.
    """
    # Only the listing itself can raise the errors handled below, so keep the
    # try block limited to that call.
    try:
        files = sorted(os.listdir(directory))  # Sort for consistent indexing
    except FileNotFoundError:
        print(f"The directory '{directory}' does not exist.")
        return []
    except NotADirectoryError:
        # The path exists but points at a regular file, not a directory.
        print(f"The path '{directory}' is not a directory.")
        return []
    except PermissionError:
        print(f"Permission denied for accessing the directory '{directory}'.")
        return []

    print(f"Files in '{directory}':")
    for i, file in enumerate(files, start=1):
        print(f"{i}: {file}")  # Display the 1-based index for each entry
    return files


In [11]:
# Main workflow
file_list = get_file_list(in_can)  # Get all filenames

# get_file_list returns an empty list when the directory is missing, unreadable
# or empty; fail with a clear message instead of an IndexError on file_list[0].
if not file_list:
    raise FileNotFoundError(f"No files available in '{in_can}'.")

# Prepend the directory path to the first file name
file_path = os.path.join(in_can, file_list[0])  # Full file path

Files in '/media/arupreza/Assets/Data/Dataset_Yoonji/CAN/':
1: normal_CAN.trc
2: normal_CAN_30(1).trc
3: normal_CAN_30(10).trc
4: normal_CAN_30(2).trc
5: normal_CAN_30(3).trc
6: normal_CAN_30(4).trc
7: normal_CAN_30(5).trc
8: normal_CAN_30(6).trc
9: normal_CAN_30(7).trc
10: normal_CAN_30(8).trc
11: normal_CAN_30(9).trc
12: replay_attack1_CAN_60(1).trc
13: replay_attack1_CAN_60(2).trc
14: replay_attack1_CAN_60(3).trc
15: replay_attack1_CAN_60(4).trc
16: replay_attack1_CAN_60(5).trc
17: replay_attack2_CAN_60(1).trc
18: replay_attack2_CAN_60(2).trc
19: replay_attack2_CAN_60(3).trc
20: replay_attack2_CAN_60(4).trc
21: replay_attack2_CAN_60(5).trc


In [25]:
# Columns kept in the cleaned frame: timing, identifier, the eight payload
# columns (Payload_1 .. Payload_8), and the derived Time_Gap column.
cols = [
    "Time_Offset",
    "CAN_ID",
    *(f'Payload_{byte_no}' for byte_no in range(1, 9)),
    "Time_Gap",
]


In [29]:
def process_file(file_path, cols):
    """
    Load a trace file via ``Convert_to_df`` and return a cleaned DataFrame.

    ``Time_Offset`` is coerced to numeric (unparseable entries become NaN),
    a ``Time_Gap`` column holding the difference between consecutive
    ``Time_Offset`` values is added, the index is reset, only ``cols`` are
    kept, and every remaining NaN is replaced by -1.

    Args:
        file_path (str): Path to the file to be processed.
        cols (list): Column names to keep in the resulting DataFrame.

    Returns:
        pd.DataFrame: The selected columns with NaN replaced by -1.
    """
    frame = Convert_to_df(file_path)

    # Coerce once and reuse the numeric series for both columns.
    offsets = pd.to_numeric(frame["Time_Offset"], errors="coerce")
    frame["Time_Offset"] = offsets

    # The first row has no predecessor, so its gap is NaN (filled with -1 below).
    frame["Time_Gap"] = offsets.diff(1)

    # Fresh 0..n-1 index, keep the requested columns, then fill gaps with -1.
    return frame.reset_index(drop=True)[cols].fillna(-1)

In [30]:
# Process the file at `file_path`, keeping only the columns listed in `cols`
cleaned_data = process_file(file_path, cols)

In [31]:
# Show the cleaned frame as this cell's output
cleaned_data

Unnamed: 0,Time_Offset,CAN_ID,Payload_1,Payload_2,Payload_3,Payload_4,Payload_5,Payload_6,Payload_7,Payload_8,Time_Gap
0,5.6,043F,01,45,60,FF,66,00,00,00,-1.0
1,5.9,0440,FF,91,00,00,FF,00,00,00,0.3
2,6.3,0316,05,20,02,0A,20,13,00,7F,0.4
3,6.5,0260,05,20,00,30,3C,8D,5F,0E,0.2
4,6.8,018F,00,2B,20,00,00,3F,00,10,0.3
...,...,...,...,...,...,...,...,...,...,...,...
634721,304261.5,043F,07,47,60,FF,6B,00,00,00,3.8
634722,304261.7,0370,FF,20,E0,80,FF,00,00,C0,0.2
634723,304262.0,0440,FF,4E,00,00,FF,00,00,00,0.3
634724,304262.2,04B0,00,00,00,80,00,40,00,80,0.2
