In [10]:
import warnings
import os
# Hide every Python warning raised for the remainder of the session.
warnings.filterwarnings(action="ignore")
# TensorFlow native log threshold: level 3 filters out INFO (1), WARNING (2), and ERROR (3) messages.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [11]:
import warnings
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd
import numpy as np

In [None]:
# Load the combined EVSE-B HPC kernel-events CSV into the working DataFrame `data`.
data = pd.read_csv(filepath_or_buffer='./data/EVSE-B-HPC-Kernel-Events-Combined.csv')

In [16]:
# Rename the "State" column to "ChargingState" directly on `data` (no new frame is created).
data.rename({"State": "ChargingState"}, axis="columns", inplace=True)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `data`.
data.describe()

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_sb_inodes_requeue,writeback_writeback_single_inode,writeback_writeback_single_inode_start,writeback_writeback_start,writeback_writeback_wait,writeback_writeback_wait_iff_congested,writeback_writeback_wake_background,writeback_writeback_write_inode,writeback_writeback_write_inode_start,writeback_writeback_written
count,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,...,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0,8468.0
mean,2147.350217,0.0,0.0,0.0,0.0,0.0,140560500.0,5.096717,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,1840.029656,0.0,0.0,0.0,0.0,0.0,265381900.0,127.341742,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,278.573706,0.0,0.0,0.0,0.0,0.0,1660856.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1829.902628,0.0,0.0,0.0,0.0,0.0,5650357.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3689.615245,0.0,0.0,0.0,0.0,0.0,10760730.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5855.669471,0.0,0.0,0.0,0.0,0.0,973875800.0,5343.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Normalise column dtypes on `data`, then derive `data_cleaned` with incomplete rows removed.

# Step 1: make sure the known categorical columns are stored with the 'object' dtype
categorical_cols = ['Label', 'ChargingState', 'Scenario', 'Attack', 'interface']
for col in categorical_cols:
    if col not in data.columns:
        continue  # tolerate inputs that lack some of the listed columns
    data[col] = data[col].astype('object')

# Step 2: shrink each int64 column to the smallest integer dtype that can hold its values
for col in list(data.select_dtypes(include='int64')):
    data[col] = pd.to_numeric(data[col], downcast='integer')

# Step 3: keep only the rows that have no missing values (returns a new frame; `data` is untouched)
data_cleaned = data.dropna(axis=0, how='any')

In [19]:
# Display the frame that remains after rows with missing values were dropped.
data_cleaned

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_wait_iff_congested,writeback_writeback_wake_background,writeback_writeback_write_inode,writeback_writeback_write_inode_start,writeback_writeback_written,ChargingState,Attack,Scenario,Label,interface
0,5.001477,0,0,0,0,0,693371795,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
1,5.001487,0,0,0,0,0,699964025,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
2,5.001641,0,0,0,0,0,549770341,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
3,5.003762,0,0,0,0,0,571970875,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
4,10.065740,0,0,0,0,0,553199786,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,283.646045,0,0,0,0,0,5463807,0,0,0,...,0,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp
6162,288.714173,0,0,0,0,0,10977108,0,0,0,...,0,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp
6163,293.790086,0,0,0,0,0,3683292,0,0,0,...,0,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp
6164,298.861925,0,0,0,0,0,5165840,0,0,0,...,0,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp


In [20]:
# Augment the cleaned dataset with synthetic rows drawn from an SDV Gaussian-copula model.
# Everything below works on `data_cleaned` (the NaN-free frame prepared for model training)
# so that metadata detection, fitting, sizing and the final concat all see the same rows.

# Step 1: Define metadata from the frame the model is trained on
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=data_cleaned)

# Force the sdtype rather than relying on auto-detection for this column
metadata.update_column(column_name="ChargingState", sdtype="categorical")

# Step 2: Initialize and train the synthesizer
# NOTE: the variable is called `ctgan` for backward compatibility with other cells,
# but the model is a GaussianCopulaSynthesizer, not a CTGAN.
ctgan = GaussianCopulaSynthesizer(metadata)
ctgan.fit(data_cleaned)

# Step 3: Generate synthetic data (half as many rows as the training frame,
# giving a combined dataset 1.5x the original size)
SYNTHETIC_FRACTION = 0.5
new_data = ctgan.sample(num_rows=int(len(data_cleaned) * SYNTHETIC_FRACTION))

# Step 4: Combine the original and synthetic data under a fresh 0..n-1 index
augmented_df = pd.concat([data_cleaned, new_data], ignore_index=True)


In [21]:
# List the distinct ChargingState values present in the combined (original + synthetic) frame.
augmented_df["ChargingState"].unique()

array(['Charging', 'idle'], dtype=object)

In [None]:
# Persist the augmented dataset as CSV, leaving the row index out of the file
augmented_df.to_csv(path_or_buf="data/EVSE_augmented_1.5.csv", index=False)

In [None]:
# Read the augmented CSV back from disk into `data_new`.
data_new = pd.read_csv(filepath_or_buffer='./data/EVSE_augmented_1.5.csv')

In [None]:
# Preview the first rows of the augmented dataset reloaded from disk.
data_new.head()

In [None]:
# List the distinct values of the Label column in `data` (the frame loaded from the source CSV).
data["Label"].unique()

In [None]:
# Report the class balance of `data`: total rows, benign rows, attack rows and the attack share.

def _bold(text):
    """Return ``text`` wrapped in the ANSI bold (SGR 1) and reset (SGR 0) escape codes.

    Stands in for the `cl(..., attrs=['bold'])` helper, which was never imported
    or defined in this notebook and therefore raised NameError on a fresh kernel.
    """
    return "\033[1m{}\033[0m".format(text)


cases = len(data)
benign_count = int((data["Label"] == "benign").sum())
attack_count = int((data["Label"] == "attack").sum())
# Guard the division so an empty frame reports 0.0 instead of raising ZeroDivisionError.
anomolies_percentage = round(attack_count / cases * 100, 2) if cases else 0.0

print(_bold('CASE COUNT'))
print(_bold('--------------------------------------------'))
print(_bold('Total number of cases are {}'.format(cases)))
print(_bold('Number of Benign cases are {}'.format(benign_count)))
print(_bold('Number of attack cases are {}'.format(attack_count)))
print(_bold('Percentage of attack cases is {}'.format(anomolies_percentage)))
print(_bold('--------------------------------------------'))