## Import Data

In [10]:
import pandas as pd
%run utils.ipynb
from datetime import datetime

In [11]:
dos_df,fuzzy_df,attack_free_df=load_data("out_paths",lib="pd")

In [12]:
only_dos_df=dos_df[dos_df["updated_flag"]=='T']
only_fuzzy_df=fuzzy_df[fuzzy_df["updated_flag"]=='T']

In [13]:
attack_free_inside_dos_df=dos_df[dos_df["updated_flag"]=='R']
attack_free_inside_fuzzy_df=fuzzy_df[fuzzy_df["updated_flag"]=='R']


## Preprocessing Steps

### Common Methods

In [14]:
def validate_column_in_dataframe(df, column_name):
    """
    Checks column exist or not in given df.

    Parameters
    ----------
    df :pl.DataFrame
        Input DataFrame.
    column_name : str
        Column name that will be checked.

    Raises
    ------
    ValueError
       If the specified column does not exist in the DataFrame.
    """

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

### Delete Data

#### Delete Noisy Data

- In the attack-free dataset inside the fuzzy dataset, the value 6 in the dlc column appears only three times out of 3 million records. Since it is noisy, it needs to be removed.

In [15]:
attack_free_inside_fuzzy_df[attack_free_inside_fuzzy_df["dlc"]==6]


Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,updated_flag
1546675,1478197000.0,105,6,eb,1,b7,0,98,2,,,R
1713142,1478197000.0,105,6,ec,1,b8,0,be,1,,,R
1713159,1478197000.0,105,6,eb,1,b7,0,98,2,,,R


In [16]:
attack_free_inside_fuzzy_df=attack_free_inside_fuzzy_df[attack_free_inside_fuzzy_df["dlc"]!=6]

#### Delete Column

##### frame_type in attack_free_df

In [17]:
def drop_columns(df, columns):
    """
    Drop specified columns from DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame that columns will be dropped.
    columns : list
        list of column names to be dropped.

    Returns
    -------
    pd.DataFrame
        Modified DataFrame with the specified columns removed.
        
    Raises
    ------
    KeyError
        If any of the specified columns do not exist in the DataFrame.
    """
    validate_column_in_dataframe(df, columns)
    return df.drop(columns=columns)

In [18]:
attack_free_df=drop_columns(attack_free_df,"frame_type")

### Sample data

In [19]:
def do_random_sampling(df, sample_size):
    """_summary_

    Perform random sampling on a given DataFrame.
    ----------
    df : pd.DataFrame
        The input dataframe from which to sample data.
    sample_size : int
        The number of samples to extract

    Returns
    -------
    pd.DataFrame
        A randomly sampled DataFrame with 'sample_size' rows
    """
    return df.sample(n=sample_size,random_state=42)

In [20]:
def do_proportionate_stratified_sampling(df,column_name, sample_fraction):
    """_summary_

    Perform proportionate stratified sampling on a given DataFrame.

    This function samples a specified fraction of each unique category 
    in the given column, ensuring the original distribution is maintained.
    ----------
    df : _type_
        The input DataFrame containing data.
    column_name : _type_
        The name of column to use for stratified sampling.
    sample_fraction : _type_
        The fraction of data to sample from each category. (between 0 and 1)

    Returns
    -------
    pd.DataFrame
        A proportionately stratified sample of the input DataFrame.

    Raises
    ------
    ValueError
        If the sample_fraction is not between 0 and 1.
    """ 

    if not (0<sample_fraction<=1):
        raise ValueError("sample_fraction must be between 0 and 1")
    
    #group by creates sub-dataframes for each unique value in the column
    #apply allows us to apply a function to each of these sub-dataframes
    #lambda applies sample to each sub-dataframe
    
    return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


In [21]:
sampled_dos_df=do_random_sampling(only_dos_df, 40000)
sampled_fuzzy_df=do_random_sampling(only_fuzzy_df, 40000)

In [22]:
sampled_attack_free_df=do_proportionate_stratified_sampling(attack_free_df,"dlc",0.02)
sampled_attack_free_inside_dos_df=do_proportionate_stratified_sampling(attack_free_inside_dos_df,"dlc",0.003)
sampled_attack_free_inside_fuzzy_df=do_proportionate_stratified_sampling(attack_free_inside_fuzzy_df,"dlc",0.003)

print(sampled_attack_free_df.shape)
print(sampled_attack_free_inside_dos_df.shape)
print(sampled_attack_free_inside_fuzzy_df.shape)


  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))
  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


(19778, 11)
(9235, 12)
(10041, 12)


  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


### Sort data

In [23]:
def sort_df_by_column(df,column_name):
    """

    Sort the given DataFrame by the values in the specified column.
    ----------
    df : pd.DataFrame
        The input DataFrame to sort.
    column_name : str
        The name of the column to use for sorting.

    Returns
    -------
    pd.DataFrame
        The input DataFrame sorted by the values in the specified column.
    
    Raises
    ------
    ValueError
        If the column name is not found in the DataFrame.
    """
    validate_column_in_dataframe(df,column_name)
    return df.sort_values(by=column_name,ascending=True)

In [24]:
def sort_multiple_dfs_by_column(dfs,column_name):
    """
    Sort multiple DataFrames by the values in the specified column.
    ----------      

    Parameters
    ----------
    dfs : list
        List of DataFrames to sort.
    column_name : str
        The name of the column to use for sorting.

    Returns
    -------
    list
        List of DataFrames sorted by the values in the specified column.

    Raises
    ------
    ValueError
        If the column name is not found in any of the DataFrames.
    """
    sorted_dfs=[]
    for df in dfs:
        validate_column_in_dataframe(df,column_name)
        sorted_dfs.append(df.sort_values(by=column_name,ascending=True))
    return sorted_dfs


In [25]:
df_list=[sampled_dos_df,sampled_fuzzy_df,sampled_attack_free_df,sampled_attack_free_inside_dos_df,sampled_attack_free_inside_fuzzy_df]
column_name="timestamp"
sorted_dfs= sort_multiple_dfs_by_column(df_list,column_name)


In [26]:
sorted_dos_df,sorted_fuzzy_df,sorted_attack_free_df,sorted_attack_free_inside_dos_df,sorted_attack_free_inside_fuzzy_df=sorted_dfs

### Insert columns

#### updated_flag into attack free df

In [27]:
def insert_new_column(df,new_column_name):
    """
    Insert new column into a DataFrame, initializing with missing values(pd.NA)

    Parameters
    ----------
    df :pd.DataFrame
        The input DataFrame that new colum will be added.
    new_column_name : str
        The name of the new column to add to the DataFrame.


    Returns
    -------
    pd.DataFrame
        A DataFrame with the new column added.
    """
    df[new_column_name]=pd.NA
    return df


In [28]:
inserted_attack_free_df=insert_new_column(sorted_attack_free_df,"updated_flag")

In [29]:
inserted_attack_free_df.head()

Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,updated_flag
32,1479121000.0,0329,8,87,b9,7e,14,12,20,00,14,
46,1479121000.0,018f,8,fe,36,00,00,00,3c,00,0,
209,1479121000.0,0130,8,06,80,00,ff,0b,80,0a,44,
294,1479121000.0,02c0,8,14,00,00,00,00,00,00,0,
387,1479121000.0,0545,8,d8,00,00,8a,00,00,00,0,


#### attack_type column into all columns

In [30]:
inserted_dos_df,inserted_fuzzy_df,inserted_attack_free_df,inserted_attack_free_inside_dos_df,inserted_attack_free_inside_fuzzy_df=sorted_dos_df,sorted_fuzzy_df,inserted_attack_free_df,sorted_attack_free_inside_dos_df,sorted_attack_free_inside_fuzzy_df
inserted_dfs=inserted_dos_df,inserted_fuzzy_df,inserted_attack_free_df,inserted_attack_free_inside_dos_df,inserted_attack_free_inside_fuzzy_df

In [31]:
new_column_name="attack_type"
for df in inserted_dfs:
    insert_new_column(df,new_column_name)
    

### Fill columns

In [32]:
def fill_column_with_value(df,column_name,value):
    """
    Fills a specified column in a DataFrame with a given value.
    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame that column will be updated.
    column_name : str
        The name of column to be filled.
    value : any
        The value to fill column with.

    Returns
    -------
    pd.DataFrame
       The modified DataFrame with the specified column updated.
    """
    df[column_name]=value
    return df

#### updated_flag value of attack free df

In [33]:
filled_attack_free_df=fill_column_with_value(inserted_attack_free_df,"updated_flag",'R')

In [34]:
filled_attack_free_df.head()

Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,updated_flag,attack_type
32,1479121000.0,0329,8,87,b9,7e,14,12,20,00,14,R,
46,1479121000.0,018f,8,fe,36,00,00,00,3c,00,0,R,
209,1479121000.0,0130,8,06,80,00,ff,0b,80,0a,44,R,
294,1479121000.0,02c0,8,14,00,00,00,00,00,00,0,R,
387,1479121000.0,0545,8,d8,00,00,8a,00,00,00,0,R,


#### attack_type columns in all dfs

In [35]:
filled_dos_df,filled_fuzzy_df,filled_attack_free_df,filled_attack_free_inside_dos_df,filled_attack_free_inside_fuzzy_df=inserted_dos_df,inserted_fuzzy_df,filled_attack_free_df,inserted_attack_free_inside_dos_df,inserted_attack_free_inside_fuzzy_df
filled_dfs=filled_dos_df,filled_fuzzy_df,filled_attack_free_df,filled_attack_free_inside_dos_df,filled_attack_free_inside_fuzzy_df
    

In [36]:
for df in [filled_attack_free_df,filled_attack_free_inside_dos_df,filled_attack_free_inside_fuzzy_df]:
    fill_column_with_value(df,"attack_type",0)

In [37]:
filled_dos_df=fill_column_with_value(filled_dos_df,"attack_type",1)
filled_fuzzy_df=fill_column_with_value(filled_fuzzy_df,"attack_type",2)

In [38]:
filled_dfs=filled_dos_df,filled_fuzzy_df,filled_attack_free_df,filled_attack_free_inside_dos_df,filled_attack_free_inside_fuzzy_df


### Rename data types

#### updated_flag -> flag

In [39]:
def rename_column(df, column_name, new_column_name):
    """
    Renames a specified column in a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame in which the column will be renamed.
    column_name : str
        The current name of the column to be renamed.
    new_column_name : str
        The new name for the column.

    Returns
    -------
    pandas.DataFrame
        The DataFrame with the specified column renamed.

    Raises
    ------
    KeyError
        If the specified column does not exist in the DataFrame.
    """
    validate_column_in_dataframe(df, column_name)
    return df.rename(columns={column_name: new_column_name})


In [40]:
def rename_multiple_dfs_columns(dfs,column_name,new_column_name):
    """_summary_

    Parameters
    ----------
    dfs : _type_
        _description_
    column_name : _type_
        _description_
    new_column_name : _type_
        _description_

    Returns
    -------
    _type_
        _description_
    """
    renamed_dfs=[]
    for df in dfs:
        renamed_dfs.append(rename_column(df,column_name,new_column_name))
    return renamed_dfs

In [41]:
renamed_dfs=rename_multiple_dfs_columns(filled_dfs,"updated_flag","flag")

### Convert data types

#### timestamp->datetime

- We convert timestamp to datetime for data visualization as a next step!

In [42]:
def convert_column_timestamp_to_datetime(df,column_name,new_column_name):
    #ten digit timestamp suggessts seconds since epoch
    """
    Convert a Unix timestamp to a datetime object and add it as a new column to the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the timestamp column.
    column_name : str
        The name of the column containing the Unix timestamp.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the new column added.
    """
 
    validate_column_in_dataframe(df,column_name)
    
    df[new_column_name]=pd.to_datetime(df[column_name],unit='s')
    return df

In [43]:
def convert_multiple_dfs_timestamp_to_datetime(dfs,column_name,new_column_name):
    """
    Convert a Unix timestamp to a datetime object and add it as a new column to each DataFrame in the list.

    Parameters
    ----------
    dfs : list
        List of DataFrames containing the timestamp column.
    column_name : str
        The name of the column containing the Unix timestamp.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    list
        List of DataFrames with the new column added.
    """
    converted_dfs=[]
    for df in dfs:
        converted_dfs.append(convert_column_timestamp_to_datetime(df,column_name,new_column_name))
    return converted_dfs

In [44]:
converted_timestamp_dfs=convert_multiple_dfs_timestamp_to_datetime(renamed_dfs,"timestamp","datetime")

#### can_id hex(str) ->  can_id (int) 

In [45]:
def convert_str_hex_to_int(df,column_name,new_column_name):
    """
    Convert a hexadecimal string to an integer and add it as a new column to the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the hexadecimal column.
    column_name : str
        The name of the column containing the hexadecimal string.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the new column added.
    """
    validate_column_in_dataframe(df,column_name)
    df[new_column_name]=df[column_name].apply(lambda x: int(x,16))
    return df

In [46]:
def convert_multiple_dfs_str_hex_to_int(dfs,column_name,new_column_name):
    """
    Convert a hexadecimal string to an integer and add it as a new column to each DataFrame in the list.

    Parameters
    ----------
    dfs : list
        List of DataFrames containing the hexadecimal column.
    column_name : str
        The name of the column containing the hexadecimal string.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    list
        List of DataFrames with the new column added.
    """
    converted_dfs=[]
    for df in dfs:
        converted_dfs.append(convert_str_hex_to_int(df,column_name,new_column_name))
    return converted_dfs

In [47]:
converted_can_id_dfs=convert_multiple_dfs_str_hex_to_int(converted_timestamp_dfs,"can_id","can_id")

In [48]:
converted_can_id_dfs[0].head()

Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,flag,attack_type,datetime
1479,1478198000.0,0,8,0,0,0,0,0,0,0,0,T,1,2016-11-03 18:39:37.186119080
1487,1478198000.0,0,8,0,0,0,0,0,0,0,0,T,1,2016-11-03 18:39:37.188112974
1517,1478198000.0,0,8,0,0,0,0,0,0,0,0,T,1,2016-11-03 18:39:37.195993900
1529,1478198000.0,0,8,0,0,0,0,0,0,0,0,T,1,2016-11-03 18:39:37.199111938
1569,1478198000.0,0,8,0,0,0,0,0,0,0,0,T,1,2016-11-03 18:39:37.219604969


In [49]:
byte_columns=["byte_0","byte_1","byte_2","byte_3","byte_4","byte_5","byte_6","byte_7"]

#### byte_0 through byte_7 (str hex)→(int)