## Import data and libraries

In [1]:
import polars as pl
import datetime as dt
from omegaconf import OmegaConf
import os

In [2]:
notebooks_dir = os.getcwd()
base_dir = os.path.abspath(os.path.join(notebooks_dir, ".."))
config_path = os.path.join(base_dir, "config.yaml")
config = OmegaConf.load(config_path)


In [3]:
dos_df_path = os.path.join(base_dir, config.paths.dos_df)
fuzzy_df_path = os.path.join(base_dir, config.paths.fuzzy_df)
attack_free_df_path = os.path.join(base_dir, config.paths.attack_free_df)


In [4]:
dos_df = pl.read_csv(dos_df_path)
fuzzy_df = pl.read_csv(fuzzy_df_path)
attack_free_df = pl.read_csv(attack_free_df_path)
dfs = [dos_df, fuzzy_df, attack_free_df]

## Data Manipulation

### Convert Data Types

#### Timestamp

In [5]:
col="timestamp"

In [6]:
def convert_timestamp_to_datetime(df, new_column_name, existing_column_name):
    """
    Convert float timestamp column into datetime and add as new column.

    Parameters
    ----------
    data : pl.DataFrame
        Input DataFrame containing the timestamp column.
    new_column_name : str
        Name of new column to be added.
    existing_column_name : str
        Name of the existing column containing timestamps.

    Returns
    -------
    pl.DataFrame
        DataFrame with newly added datetime column.

    Raises
    ------
    ValueError
        If the existing column is not found in the DataFrame.
    """

    if existing_column_name not in df.columns:
        raise ValueError
    (f"Column '{existing_column_name}' not found in DataFrame.")

    return df.with_columns(
        pl.from_epoch(pl.col(existing_column_name), time_unit="s").alias(
            new_column_name
        )
    )

In [7]:
def convert_multiple_dfs_timestamp_to_datetime(
    dfs, new_column_name, existing_column_name
):
    """
    Convert multiple dfs' timestamp column into datetime column.

    Parameters
    ----------
    dfs : list
        List of DataFrame.
    new_column_name : str
        Name of new column to be added
    existing_column_name : str
        Name of the existing column containing timestamps.

    Returns
    -------
    list
        List of updated DataFrame.
    """
    return [
        convert_timestamp_to_datetime(df, new_column_name, 
                                      existing_column_name)
        for df in dfs
    ]

In [8]:
dfs = [dos_df, fuzzy_df, attack_free_df]
new_timestamp_column_name = "datetime"
existing_timestamp_column_name = "timestamp"

converted_dfs = convert_multiple_dfs_timestamp_to_datetime(
    dfs, new_timestamp_column_name, existing_timestamp_column_name
)
dos_df, fuzzy_df, attack_free_df = converted_dfs

#### Datetime

In [9]:
col="datetime"
dos_df[col].dtype

Datetime(time_unit='us', time_zone=None)

In [10]:
def divide_datetime_to_day_and_hour(df,existing_column_name,day,hour,):
    return df.with_columns(
        pl.col(existing_column_name).dt.day().alias(day),
        pl.col(existing_column_name).dt.hour().alias(hour),
        )


In [11]:
dos_df.columns

['timestamp',
 'canId',
 'dlc',
 'byte0',
 'byte1',
 'byte2',
 'byte3',
 'byte4',
 'byte5',
 'byte6',
 'byte7',
 'updatedFlag',
 'datetime']

#### CanID

In [12]:
def convert_str_hex_to_int(df,new_column_name,existing_column_name):
    """
    Convert hex that it's dtype is str into int column.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame containing the hex column.
    new_column_name : str
        Name of new column to be added.
    existing_column_name : str
        Name of the existing column containing hex.

    Returns
    -------
    pl.DataFrame
        DataFrame with newly added hex int column.

    Raises
    ------
    ValueError
        If the existing column is not found in the DataFrame.
    """

    if existing_column_name not in df.columns:
        raise ValueError
    (f"Column '{existing_column_name}' not found in DataFrame.")

    return df.with_columns(pl.col(existing_column_name).str.to_integer(base=16, strict=True).alias(new_column_name))

In [13]:
def convert_multiple_dfs_str_hex_canid_to_int(dfs, new_column_name, existing_column_name):
    """
    Convert multiple dfs' str hex column into int hex column.

    Parameters
    ----------
    dfs : list
        List of DataFrame.
    new_column_name : str
        Name of new column to be added
    existing_column_name : str
        Name of the existing column containing hex.

    Returns
    -------
    list
        List of updated DataFrame.
    """
    return [
        convert_str_hex_to_int(df,new_column_name,existing_column_name)
        for df in dfs
    ]


In [14]:
new_canid_column_name = "updatedCanIdInt"
existing_canid_column_name = "canId"

converted_dfs = convert_multiple_dfs_str_hex_canid_to_int(
    dfs, new_canid_column_name, existing_canid_column_name
)
dos_df, fuzzy_df, attack_free_df = converted_dfs

### Add New Features

#### Bytes into One Message

In [15]:
dos_df.head()

timestamp,canId,dlc,byte0,byte1,byte2,byte3,byte4,byte5,byte6,byte7,updatedFlag,updatedCanIdInt
f64,str,i64,str,str,str,str,str,str,str,str,str,i64
1478200000.0,"""018f""",8,"""fe""","""5b""","""00""","""00""","""00""","""3c""","""00""","""00""","""R""",399
1478200000.0,"""0260""",8,"""19""","""21""","""22""","""30""","""08""","""8e""","""6d""","""3a""","""R""",608
1478200000.0,"""02a0""",8,"""64""","""00""","""9a""","""1d""","""97""","""02""","""bd""","""00""","""R""",672
1478200000.0,"""0329""",8,"""40""","""bb""","""7f""","""14""","""11""","""20""","""00""","""14""","""R""",809
1478200000.0,"""0545""",8,"""d8""","""00""","""00""","""8a""","""00""","""00""","""00""","""00""","""R""",1349


In [16]:
dos_df.with_columns((pl.concat([dos_df[f"byte{i}"] for i in range(dos_df["dlc"].max())],how= 'vertical')).alias("UpdatedBytes"))


ShapeError: unable to add a column of length 29326160 to a DataFrame of height 3665770

In [None]:
dos_df.head()

timestamp,canId,dlc,byte0,byte1,byte2,byte3,byte4,byte5,byte6,byte7,updatedFlag,datetime,updatedCanIdInt
f64,str,i64,str,str,str,str,str,str,str,str,str,datetime[μs],i64
1478200000.0,"""018f""",8,"""fe""","""5b""","""00""","""00""","""00""","""3c""","""00""","""00""","""R""",2016-11-03 18:39:36,399
1478200000.0,"""0260""",8,"""19""","""21""","""22""","""30""","""08""","""8e""","""6d""","""3a""","""R""",2016-11-03 18:39:36,608
1478200000.0,"""02a0""",8,"""64""","""00""","""9a""","""1d""","""97""","""02""","""bd""","""00""","""R""",2016-11-03 18:39:36,672
1478200000.0,"""0329""",8,"""40""","""bb""","""7f""","""14""","""11""","""20""","""00""","""14""","""R""",2016-11-03 18:39:36,809
1478200000.0,"""0545""",8,"""d8""","""00""","""00""","""8a""","""00""","""00""","""00""","""00""","""R""",2016-11-03 18:39:36,1349
