In [64]:
import pandas as pd
from scipy.stats import entropy
import numpy as np

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [65]:
def prepare_dataset(df):
    df = df.dropna()
    
    client_ip = df['Destination'].value_counts().idxmax()
    df['packet_direction'] = (df['Source'] == client_ip).astype(int)

    protocol_dummies = pd.get_dummies(df["Protocol"], prefix='proto')
    df = pd.concat([df, protocol_dummies], axis=1)
    
    protocols = ['proto_DNS', 'proto_HTTP', 'proto_HTTP/JSON', 'proto_QUIC', 'proto_TCP', 'proto_TLSv1.2', 'proto_TLSv1.3', 'proto_WebSocket']
    for protocol in protocols:
        if protocol in df.columns:
            df[f'rolling_count_{protocol}'] = df[protocol].rolling(window=20).sum()
    
    df['entropy_packet_length'] = df['Length'].rolling(window=20).apply(entropy, raw=True).fillna(0)
    df['inter_arrival_time'] = df['Time'].diff().fillna(0)
    
    df['fourier_packet_time'] = np.abs(np.fft.fft(df['inter_arrival_time']))
    df['ema_packet_size'] = df['Length'].ewm(span=5, adjust=False).mean()
    
    df['packet_size_change'] = df['Length'].diff().fillna(0)
    df = calculate_time_since_last(df, 'Time', 'Protocol', 'HTTP') 
    df = calculate_time_since_last(df, 'Time', 'Protocol', 'DNS') 
    df = calculate_time_since_last(df, 'Time', 'Protocol', 'HTTP/JSON') 
    df = calculate_time_since_last(df, 'Time', 'Protocol', 'QUIC') 

    # Adding rolling average and median for time_since_last_DNS
    df['rolling_avg_time_since_last_DNS'] = df['time_since_last_DNS'].rolling(window=2000).mean().fillna(0)
    df['rolling_median_time_since_last_DNS'] = df['time_since_last_DNS'].rolling(window=2000).median().fillna(0)
    df['rolling_std_time_since_last_DNS'] = df['time_since_last_DNS'].rolling(window=2000).std().fillna(0)
    df['rolling_var_time_since_last_DNS'] = df['time_since_last_DNS'].rolling(window=2000).var().fillna(0)
    
    df = calculate_std_inter_arrival_time(df)
    

    # TODO: fix the Unnamed: 0 column
    return df.drop([
        'Unnamed: 0'
        ], axis=1)
    
def calculate_std_inter_arrival_time(df):
  df['std_inter_arrival_time'] = df['inter_arrival_time'].rolling(window=20).std().fillna(0)
  return df




def calculate_time_since_last(df, time_column, protocol_column, protocol_value):
    protocol_mask = df[protocol_column] == protocol_value
    df[f'last_time_{protocol_value}'] = df.loc[protocol_mask, time_column]
    df[f'last_time_{protocol_value}'] = df[f'last_time_{protocol_value}'].ffill()
    df[f'time_since_last_{protocol_value}'] = df[time_column] - df[f'last_time_{protocol_value}']

    return df


In [66]:
def balance_dataset(df, target_column):
    # Separate features and target variable
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Initialize the RandomOverSampler
    oversampler = RandomOverSampler(random_state=42)

    # Fit the oversampler and resample the data
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Combine the resampled features and target into a single DataFrame
    balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
    balanced_df[target_column] = y_resampled

    return balanced_df




In [67]:
for batch in range(1,5):

    df1 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec.csv')
    df2 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec (1).csv')
    df3 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec (2).csv')
    df4 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec (3).csv')
    df5 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec (4).csv')
    df6 = pd.read_csv(f'../datasets/batch_{batch}/dataset_600_sec (5).csv')

    df1 = prepare_dataset(df1)
    df2 = prepare_dataset(df2)
    df3 = prepare_dataset(df3)
    df4 = prepare_dataset(df4)
    df5 = prepare_dataset(df5)
    df6 = prepare_dataset(df6)

    df = pd.concat([df1, df2, df3, df4, df5, df6])
    df.dropna(inplace=True)

    df = balance_dataset(df, 'curr_state')

    df.to_csv(f'../datasets/batch_{batch}/combined.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['packet_direction'] = (df['Source'] == client_ip).astype(int)


In [68]:
df1 = pd.read_csv(f'../datasets/test/dataset_600_sec.csv')
df2 = pd.read_csv(f'../datasets/test/dataset_600_sec (1).csv')
df3 = pd.read_csv(f'../datasets/test/dataset_600_sec (2).csv')
df4 = pd.read_csv(f'../datasets/test/dataset_600_sec (3).csv')
df5 = pd.read_csv(f'../datasets/test/dataset_600_sec (4).csv')
df6 = pd.read_csv(f'../datasets/test/dataset_600_sec (5).csv')

df1 = prepare_dataset(df1)
df2 = prepare_dataset(df2)
df3 = prepare_dataset(df3)
df4 = prepare_dataset(df4)
df5 = prepare_dataset(df5)
df6 = prepare_dataset(df6)

df = pd.concat([df1, df2, df3, df4, df5, df6])
df.dropna(inplace=True)

df = balance_dataset(df, 'curr_state')

df.to_csv(f'../datasets/test/combined.csv', index=False)
