In [1]:
import pandas as pd
%run utils.ipynb
from datetime import datetime

## Import Data

In [2]:
dos_df,fuzzy_df,attack_free_df=load_data("out_paths",lib="pd")

In [3]:
only_dos_df=dos_df[dos_df["updated_flag"]=='T']
only_fuzzy_df=fuzzy_df[fuzzy_df["updated_flag"]=='T']

In [4]:
attack_free_inside_dos_df=dos_df[dos_df["updated_flag"]=='R']
attack_free_inside_fuzzy_df=fuzzy_df[fuzzy_df["updated_flag"]=='R']


## Preprocessing Steps

### Common Methods

In [5]:
def validate_column_in_dataframe(df, column_name):
    """
    Checks column exist or not in given df.

    Parameters
    ----------
    df :pl.DataFrame
        Input DataFrame.
    column_name : str
        Column name that will be checked.

    Raises
    ------
    ValueError
       If the specified column does not exist in the DataFrame.
    """

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

### Delete Noisy Data

- In the attack-free dataset inside the fuzzy dataset, the value 6 in the dlc column appears only three times out of 3 million records. Since it is noisy, it needs to be removed.

In [6]:
attack_free_inside_fuzzy_df[attack_free_inside_fuzzy_df["dlc"]==6]


Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,updated_flag
1546675,1478197000.0,105,6,eb,1,b7,0,98,2,,,R
1713142,1478197000.0,105,6,ec,1,b8,0,be,1,,,R
1713159,1478197000.0,105,6,eb,1,b7,0,98,2,,,R


In [7]:
attack_free_inside_fuzzy_df=attack_free_inside_fuzzy_df[attack_free_inside_fuzzy_df["dlc"]!=6]

### Sample data

In [8]:
def do_random_sampling(df, sample_size):
    """_summary_

    Perform random sampling on a given DataFrame.
    ----------
    df : pd.DataFrame
        The input dataframe from which to sample data.
    sample_size : int
        The number of samples to extract

    Returns
    -------
    pd.DataFrame
        A randomly sampled DataFrame with 'sample_size' rows
    """
    return df.sample(n=sample_size,random_state=42)

In [9]:
def do_proportionate_stratified_sampling(df,column_name, sample_fraction):
    """_summary_

    Perform proportionate stratified sampling on a given DataFrame.

    This function samples a specified fraction of each unique category 
    in the given column, ensuring the original distribution is maintained.
    ----------
    df : _type_
        The input DataFrame containing data.
    column_name : _type_
        The name of column to use for stratified sampling.
    sample_fraction : _type_
        The fraction of data to sample from each category. (between 0 and 1)

    Returns
    -------
    pd.DataFrame
        A proportionately stratified sample of the input DataFrame.

    Raises
    ------
    ValueError
        If the sample_fraction is not between 0 and 1.
    """ 

    if not (0<sample_fraction<=1):
        raise ValueError("sample_fraction must be between 0 and 1")
    
    #group by creates sub-dataframes for each unique value in the column
    #apply allows us to apply a function to each of these sub-dataframes
    #lambda applies sample to each sub-dataframe
    
    return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


In [12]:
sampled_dos_df=do_random_sampling(only_dos_df, 40000)
sampled_fuzzy_df=do_random_sampling(only_fuzzy_df, 40000)

In [13]:
sampled_attack_free_df=do_proportionate_stratified_sampling(attack_free_df,"dlc",0.02)
sampled_attack_free_inside_dos_df=do_proportionate_stratified_sampling(attack_free_inside_dos_df,"dlc",0.003)
sampled_attack_free_inside_fuzzy_df=do_proportionate_stratified_sampling(attack_free_inside_fuzzy_df,"dlc",0.003)

print(sampled_attack_free_df.shape)
print(sampled_attack_free_inside_dos_df.shape)
print(sampled_attack_free_inside_fuzzy_df.shape)


  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))
  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


(19778, 12)
(9235, 12)
(10041, 12)


  return df.groupby(column_name, group_keys=False).apply(lambda x: x.sample(frac=sample_fraction,random_state=42))


### Sort data

In [None]:
def sort_df_by_column(df,column_name):
    """

    Sort the given DataFrame by the values in the specified column.
    ----------
    df : pd.DataFrame
        The input DataFrame to sort.
    column_name : str
        The name of the column to use for sorting.

    Returns
    -------
    pd.DataFrame
        The input DataFrame sorted by the values in the specified column.
    
    Raises
    ------
    ValueError
        If the column name is not found in the DataFrame.
    """
    validate_column_in_dataframe(df,column_name)
    return df.sort_values(by=column_name,ascending=True)

In [None]:
def sort_multiple_dfs_by_column(dfs,column_name):
    """
    Sort multiple DataFrames by the values in the specified column.
    ----------      

    Parameters
    ----------
    dfs : list
        List of DataFrames to sort.
    column_name : str
        The name of the column to use for sorting.

    Returns
    -------
    list
        List of DataFrames sorted by the values in the specified column.

    Raises
    ------
    ValueError
        If the column name is not found in any of the DataFrames.
    """
    sorted_dfs=[]
    for df in dfs:
        validate_column_in_dataframe(df,column_name)
        sorted_dfs.append(df.sort_values(by=column_name,ascending=True))
    return sorted_dfs



In [None]:
df_list=[sampled_dos_df,sampled_fuzzy_df,sampled_attack_free_df,sampled_attack_free_inside_dos_df,sampled_attack_free_inside_fuzzy_df]
column_name="timestamp"
sorted_dfs= sort_multiple_dfs_by_column(df_list,column_name)


In [15]:
sorted_dos_df,sorted_fuzzy_df,sorted_attack_free_df,sorted_attack_free_inside_dos_df,sorted_attack_free_inside_fuzzy_df=sorted_dfs

### Convert data types

#### timestamp

In [None]:
def convert_column_timestamp_to_datetime(df,column_name,new_column_name):
    #ten digit timestamp suggessts seconds since epoch
    """
    Convert a Unix timestamp to a datetime object and add it as a new column to the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the timestamp column.
    column_name : str
        The name of the column containing the Unix timestamp.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the new column added.
    """
 
    validate_column_in_dataframe(df,column_name)
    
    df[new_column_name]=pd.to_datetime(df[column_name],unit='s')
    return df

In [None]:
def convert_multiple_dfs_timestamp_to_datetime(dfs,column_name,new_column_name):
    """
    Convert a Unix timestamp to a datetime object and add it as a new column to each DataFrame in the list.

    Parameters
    ----------
    dfs : list
        List of DataFrames containing the timestamp column.
    column_name : str
        The name of the column containing the Unix timestamp.
    new_column_name : str
        The name of the new column to add to the DataFrame.

    Returns
    -------
    list
        List of DataFrames with the new column added.
    """
    converted_dfs=[]
    for df in dfs:
        converted_dfs.append(convert_column_timestamp_to_datetime(df,column_name,new_column_name))
    return converted_dfs

In [None]:
convert_column_timestamp_to_datetime(sorted_dos_df,"timestamp","datetime")

Unnamed: 0,timestamp,can_id,dlc,byte_0,byte_1,byte_2,byte_3,byte_4,byte_5,byte_6,byte_7,updated_flag,datetime
1479,1.478198e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 18:39:37.186119080
1487,1.478198e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 18:39:37.188112974
1517,1.478198e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 18:39:37.195993900
1529,1.478198e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 18:39:37.199111938
1569,1.478198e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 18:39:37.219604969
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2675494,1.478201e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 19:17:30.705996037
2675620,1.478201e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 19:17:31.302994013
2675627,1.478201e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 19:17:31.304994106
2675650,1.478201e+09,0000,8,00,00,00,00,00,00,00,00,T,2016-11-03 19:17:31.392564058
