In [1]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
from sklearn.covariance import EllipticEnvelope
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from datetime import datetime
import os
import pickle
import gc
import pandas as pd
import numpy as np
import psutil

In [2]:
pd.set_option('display.max_rows', None, 'display.max_columns', None)

## Carregar dataset

In [3]:
dataset_dir_path = './../Datasets/CSE-CIC-IDS2018/raw'
files_name_list = os.listdir(dataset_dir_path)
os.listdir(dataset_dir_path)

['03-02-2018.csv',
 '02-14-2018.csv',
 '02-22-2018.csv',
 '03-01-2018.csv',
 '02-16-2018.csv',
 '02-23-2018.csv',
 '02-20-2018.csv',
 '02-15-2018.csv',
 '.ipynb_checkpoints',
 '02-21-2018.csv',
 '02-28-2018.csv']

In [4]:
df_dataset_list = []
for file_name in files_name_list:
    if file_name.endswith('.csv'):
        file_path = os.path.join(dataset_dir_path, file_name)
        df = pd.read_csv(file_path, low_memory=False)
        df_dataset_list.append(df)

In [5]:
dataset = pd.concat(df_dataset_list, ignore_index=True)

In [6]:
# file_pattern = os.path.join(dataset_dir_path,'02-14-2018.csv')
# dataset = pd.read_csv(file_pattern)

In [7]:
initial_columns = dataset.columns
print(f"Num colunas: {len(initial_columns)}")

Num colunas: 84


In [8]:
initial_columns

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [9]:
columns = initial_columns

In [10]:
dataset["Label"].value_counts()

Label
Benign                      13484708
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Label                             59
Name: count, dtype: int64

In [11]:
treatment_functions = {}

## Redimensionamento da coluna Alvo

In [12]:
target_classes = dataset["Label"].value_counts().to_dict()

Caso as classes não estejam separadas em 0 (Benígno) e 1 (Malígno) é feito esse redimensionamento

In [13]:
classes = target_classes.keys()

if not (0 in classes and 1 in classes and len(classes) == 2):
    for target in target_classes:
        value = 0 if target == "Benign"else 1
        dataset.replace(to_replace=target, value=value, inplace=True)

In [14]:
dataset["Label"].value_counts()

Label
0    13484708
1     2748294
Name: count, dtype: int64

## Tratamento de dados ruidosos

In [None]:
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

# Deleta a coluna Timestamp
dataset.drop(columns=['Timestamp'], inplace=True)
columns = np.delete(columns, 2)

dataset.dropna(inplace=True)

In [None]:
print(f"dataset num columns: {len(dataset.columns)}")
print(f"columns lenght: {len(columns)}")

## Conversão de tipos incorretos

In [None]:
#Iterate throught each dataframe in the dictionary
for key in dataset.keys():
    for col in dataset.columns:
        #Check if the datatype of the column is object
        if dataset[col].dtype == 'object' and col != 'Label':
            # Change all values to numeric, and to NaN if it is a strig
            dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

In [None]:
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
dataset.dropna(inplace=True)

## Particionamento estratificado dos dados

In [None]:
x_train_columns = columns
x_test_columns = columns

In [None]:
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, stratify=dataset['Label'], random_state=42)

In [None]:
X_train = dataset_train.drop(columns=['Label'])
x_train_columns = np.delete(x_train_columns, -1)
Y_train = dataset_train['Label']

X_test = dataset_test.drop(columns=['Label'])
x_test_columns = np.delete(x_test_columns, -1)
Y_test = dataset_test['Label']

In [None]:
print(len(X_test))
print(len(Y_test))

In [None]:
print(f"dataset num columns: {len(X_train.columns)}")
print(f"columns lenght: {len(x_train_columns)}")

## Substituição de dados faltantes

In [None]:
# Inicializar o KNNImputer
# imputer = KNNImputer(n_neighbors=5)

# Inicializar o SimpleImputer com a estratégia 'median'
imputer = SimpleImputer(strategy='median')

In [None]:
# Aplicar a imputação nos dados de treino
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [None]:
print(f"dataset num columns: {len(X_train_imputed[0])}")
print(f"columns lenght: {len(x_train_columns)}")

In [None]:
treatment_functions['inputer'] = imputer.transform

## Transformação Yeo-Johnson

In [None]:
transformer = PowerTransformer(method='yeo-johnson')

In [None]:
# Ajuste e transformação nos dados de treino
X_train_transformed = transformer.fit_transform(X_train_imputed)

# Transformação nos dados de teste
X_test_transformed = transformer.transform(X_test_imputed)

In [None]:
print(f"dataset num columns: {len(X_train_transformed[0])}")
print(f"columns lenght: {len(x_train_columns)}")

In [None]:
treatment_functions['transformer'] = transformer.transform

## Normalização do dataset

In [None]:
scaler = MinMaxScaler()

In [None]:
# Ajuste e transformação nos dados de treino
X_train_normalized = scaler.fit_transform(X_train_transformed)

# Transformação nos dados de teste
X_test_normalized = scaler.transform(X_test_transformed)

In [None]:
print(f"dataset num columns: {len(X_train_normalized[0])}")
print(f"columns lenght: {len(x_train_columns)}")

In [None]:
treatment_functions['scaler'] = scaler.transform

## Remoção de outliers

In [None]:
# Criação do modelo Elliptic Envelope
ee = EllipticEnvelope(contamination=0.01) 

# Ajuste do modelo aos dados de treino normalizados
ee.fit(X_train_normalized)

# Identificar os outliers nos dados de treino
y_pred_train = ee.predict(X_train_normalized)

# Remover os outliers dos dados de treino
X_train_no_outliers = X_train_normalized[y_pred_train == 1]
Y_train_no_outliers = Y_train[y_pred_train == 1]

In [None]:
print(f"dataset num columns: {len(X_train_no_outliers[0])}")
print(f"columns lenght: {len(x_train_columns)}")

## Remoção de multicolinearidade

In [None]:
import numpy as np
import pandas as pd

class RemoveMulticollinearity:    
    def __init__(self, threshold=0.9):  # Ajuste o threshold conforme necessário
        self.threshold = threshold

    def fit(self, X, y=None):
        # Convert arrays to DataFrame for correlation computation
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if y is not None and isinstance(y, np.ndarray):
            y = pd.Series(y)

        if y is None:
            corr_X = X.corr()
        else:
            data = X.copy()
            data['Label'] = y
            corr_matrix = data.corr()
            corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1]

        self.drop_ = set()
        for col in corr_X.columns:
            # Select columns that are correlated above the threshold
            corr = corr_X[col][corr_X[col] >= self.threshold]

            # Always finds itself with correlation 1
            if len(corr) > 1:
                if y is None:
                    # Drop all but the first one
                    self.drop_.update(list(corr.index[1:]))
                else:
                    # Keep feature with the highest correlation with y
                    keep = corr_y[corr.index].idxmax()
                    self.drop_.update(list(corr.index.drop(keep)))

        self.drop_ = list(self.drop_)
        return self

    def transform(self, X):
        # Convert arrays to DataFrame for dropping columns
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        return X.drop(columns=self.drop_, errors='ignore').values

    def filter_column_names(self, columns):
        columns_filtered = np.delete(columns, self.drop_)
        # print(f"Colunas originais: {columns}")
        # print(f"Colunas após filtragem: {columns_filtered}")
        return columns_filtered

In [None]:
# Aplicar ao conjunto de treino
remove_multicollinearity = RemoveMulticollinearity(threshold=0.9)
remove_multicollinearity.fit(X_train_no_outliers, Y_train_no_outliers)
X_train_no_multicollinearity = remove_multicollinearity.transform(X_train_no_outliers)

In [None]:
len(X_train_no_multicollinearity[0])

In [None]:
len(x_train_columns) - len(remove_multicollinearity.drop_)

In [None]:
columns_filtered = remove_multicollinearity.filter_column_names(x_train_columns)
len(columns_filtered)

In [None]:
print(f"dataset num columns: {len(X_train_no_multicollinearity[0])}")
print(f"columns lenght: {len(columns_filtered)}")

In [None]:
treatment_functions['remove_multicolinearity'] = remove_multicollinearity.transform

In [None]:
treatment_functions['filter_column_names'] = remove_multicollinearity.filter_column_names

## Balanceamento de Classes

In [None]:
# Instanciar o SMOTE
smote = SMOTE(random_state=42)

# Aplicar o SMOTE aos dados de treino
X_train_balanced, Y_train_balanced = smote.fit_resample(X_train_no_multicollinearity, Y_train_no_outliers)

In [None]:
len(X_train_balanced[0])

In [None]:
print(f"dataset num columns: {len(X_train_balanced[0])}")
print(f"columns lenght: {len(columns_filtered)}")

## Salvar dataset pre-processado

In [None]:
current_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
dataset_treated_dir_path = f'./../Datasets/CSE-CIC-IDS2018/pre-processed/{current_datetime}'
os.makedirs(dataset_treated_dir_path, exist_ok=True)

In [None]:
df_train = pd.DataFrame(X_train_balanced, columns=columns_filtered)
df_train['Label'] = Y_train_balanced.values

In [None]:
ultima_coluna = X_test_normalized[:, -1]

In [None]:
num_ones = np.sum(ultima_coluna == 0)

In [None]:
num_ones

In [None]:
train_filename = f'train_dataset_treated.parquet'
train_file_path = os.path.join(dataset_treated_dir_path, train_filename)

df_train.to_parquet(train_file_path, index=False)

In [None]:
X_test_no_multicollinearity = remove_multicollinearity.transform(X_test_normalized)

In [None]:
df_test = pd.DataFrame(X_test_no_multicollinearity, columns=columns_filtered)
df_test['Label'] = Y_test.values

In [None]:
test_filename = f'test_dataset_treated.parquet'
test_file_path = os.path.join(dataset_treated_dir_path, test_filename)

df_test.to_parquet(test_file_path, index=False)

## Salvar funções geradas pelo pré-processamento

In [None]:
dict_functions_file_path = os.path.join(dataset_treated_dir_path,'pre_processing_functions.pkl')

with open(dict_functions_file_path, 'wb') as file:
    pickle.dump(treatment_functions, file)