In [73]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
from sklearn.covariance import EllipticEnvelope
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from datetime import datetime
import os
import pickle

In [2]:
# Lift pandas' display truncation so frames are shown with every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Carregar dataset

In [3]:
# Directory that holds the raw CSE-CIC-IDS2018 CSV files.
dataset_dir_path = './../Datasets/CSE-CIC-IDS2018/raw/original/'

# os.listdir returns entries in arbitrary order and includes every entry in
# the directory. Keep only CSV files and sort them so the concatenated row
# order (and therefore the seeded train/test split) is reproducible.
files_name_list = sorted(
    entry_name
    for entry_name in os.listdir(dataset_dir_path)
    if entry_name.lower().endswith('.csv')
)

In [4]:
# Read every listed file into its own DataFrame; they are concatenated later.
df_dataset_list = [
    pd.read_csv(os.path.join(dataset_dir_path, csv_name))
    for csv_name in files_name_list
]

In [None]:
# Stack the per-file frames row-wise and renumber the index from 0.
dataset = pd.concat(objs=df_dataset_list, axis=0, ignore_index=True)

In [4]:
# Alternative loader (kept commented out): reads only the 02-14-2018 file
# instead of concatenating every file found in the dataset directory.
# file_pattern = './../Datasets/CSE-CIC-IDS2018/raw/original/02-14-2018.csv'
# dataset = pd.read_csv(file_pattern)

In [5]:
# Class distribution of the raw labels (display only).
dataset.loc[:, "Label"].value_counts()

Label
Benign            667626
FTP-BruteForce    193360
SSH-Bruteforce    187589
Name: count, dtype: int64

In [6]:
# Registry of fitted preprocessing callables, filled in as each stage is fitted.
treatment_functions = dict()

## Redimensionamento da coluna Alvo

In [7]:
# Map each distinct label value to its number of occurrences.
label_frequencies = dataset["Label"].value_counts()
target_classes = label_frequencies.to_dict()

Caso as classes não estejam separadas em 0 (Benigno) e 1 (Maligno), é feito esse redimensionamento

In [8]:
classes = target_classes.keys()

# Recode only when the labels are not already exactly {0, 1}.
if not (0 in classes and 1 in classes and len(classes) == 2):
    # "Benign" becomes 0, every other observed class becomes 1.
    label_encoding = {
        target: 0 if target == "Benign" else 1
        for target in target_classes
    }
    # Map the Label column only. A frame-wide replace would scan every
    # feature column and overwrite any cell that equals a class value.
    dataset["Label"] = dataset["Label"].map(label_encoding)

In [9]:
# Class distribution after the binary recoding (display only).
dataset.loc[:, "Label"].value_counts()

Label
0    667626
1    380949
Name: count, dtype: int64

In [10]:
# Feature names: every column except the target. Dropping by name rather
# than by position means this does not depend on 'Label' being the last column.
columns = dataset.columns.drop('Label').to_numpy()

## Tratamento de dados ruidosos

In [11]:
# Treat +/- infinity as missing so those rows are removed by dropna below.
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
# Timestamp is not used as a feature. This raises KeyError if the cell is
# run a second time, because the column is already gone.
dataset.drop(columns=['Timestamp'], inplace=True)
# Drop every row that still contains at least one missing value.
dataset.dropna(inplace=True)

In [None]:
# Remove the dropped 'Timestamp' column from the feature-name array by name.
# A positional delete removes a different feature every time the cell is
# re-run; the boolean mask is idempotent.
columns = columns[columns != 'Timestamp']

## Conversão de tipos incorretos

In [12]:
# Convert every object-typed feature column to numeric in a single pass.
# The previous outer loop over dataset.keys() repeated this whole scan once
# per column without changing the result.
for col in dataset.columns:
    # Only object-typed feature columns need conversion; the target is left alone.
    if dataset[col].dtype == 'object' and col != 'Label':
        # Values that cannot be parsed as numbers (e.g. strings) become NaN.
        dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

## Particionamento estratificado dos dados

In [13]:
# 70/30 split that preserves the class ratio of 'Label'; seeded for reproducibility.
dataset_train, dataset_test = train_test_split(
    dataset,
    test_size=0.3,
    random_state=42,
    stratify=dataset['Label'],
)

In [14]:
# Separate the feature matrix from the target column for each partition.
X_train, Y_train = dataset_train.drop(columns='Label'), dataset_train['Label']
X_test, Y_test = dataset_test.drop(columns='Label'), dataset_test['Label']

## Substituição de dados faltantes

In [15]:
# Create the KNN imputer (5 neighbours); it is fitted in the next cell.
imputer = KNNImputer(n_neighbors=5)

In [16]:
# Fit the imputer on the training features only, then apply the same fitted
# imputer to the test features.
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [17]:
# Store the fitted imputer's transform in the registry that is pickled at the
# end of the notebook. Note the key is spelled 'inputer'; anything reading the
# pickled dict must use the same spelling.
treatment_functions['inputer'] = imputer.transform

## Transformação Yeo-Johnson

In [18]:
# Yeo-Johnson power transformer; fitted on the imputed training data below.
transformer = PowerTransformer(method='yeo-johnson')

In [19]:
# Fit on the imputed training data and transform it.
X_train_transformed = transformer.fit_transform(X_train_imputed)

# Apply the already-fitted transformer to the imputed test data.
X_test_transformed = transformer.transform(X_test_imputed)

In [20]:
# Store the fitted power transformer's transform in the registry that is
# pickled at the end of the notebook.
treatment_functions['transformer'] = transformer.transform

## Normalização do dataset

In [21]:
# Min-max scaler with default settings; fitted on the transformed training data below.
scaler = MinMaxScaler()

In [22]:
# Fit on the power-transformed training data and scale it.
X_train_normalized = scaler.fit_transform(X_train_transformed)

# Apply the already-fitted scaler to the power-transformed test data.
X_test_normalized = scaler.transform(X_test_transformed)

In [23]:
# Store the fitted scaler's transform in the registry that is pickled at the
# end of the notebook.
treatment_functions['scaler'] = scaler.transform

## Remoção de outliers

In [24]:
# Elliptic Envelope outlier detector configured with contamination=0.01.
# random_state is fixed (same seed as the split and SMOTE) so the set of
# removed rows is identical from run to run.
ee = EllipticEnvelope(contamination=0.01, random_state=42)

# Fit the detector on the normalized training data only.
ee.fit(X_train_normalized)

# predict() labels inliers as 1 and outliers as -1.
y_pred_train = ee.predict(X_train_normalized)

# Keep only the rows flagged as inliers, using one mask for both the
# features and the target so they stay in step.
inlier_mask = y_pred_train == 1
X_train_no_outliers = X_train_normalized[inlier_mask]
Y_train_no_outliers = Y_train[inlier_mask]



## Remoção de multicolinearidade

In [36]:
import numpy as np
import pandas as pd

class RemoveMulticollinearity:
    """Drop features that are strongly positively correlated with another feature.

    For each feature, the group of features whose correlation with it is
    greater than or equal to ``threshold`` is examined. Only positive
    correlations are considered (no absolute value is taken). From each
    group with more than one member a single feature is kept:

    * without ``y``: the first feature of the group;
    * with ``y``: the feature with the highest correlation with ``y``.

    Attributes set by ``fit``:
        drop_: list of column labels to remove, in discovery order. These are
            integer positions when ``fit`` received a NumPy array.
    """

    def __init__(self, threshold=0.9):  # Tune the threshold as needed
        self.threshold = threshold

    def fit(self, X, y=None):
        """Compute which columns to drop.

        Args:
            X: 2-D ``numpy.ndarray`` or ``pandas.DataFrame`` of features.
            y: optional target (array-like or ``pandas.Series``).

        Returns:
            self

        Raises:
            ValueError: if ``y`` is assigned positionally and its length
                differs from the number of rows in ``X``.
        """
        # Convert arrays to DataFrame for correlation computation
        x_was_array = isinstance(X, np.ndarray)
        if x_was_array:
            X = pd.DataFrame(X)

        if y is None:
            corr_X = X.corr()
            corr_y = None
        else:
            # When X came in as an array its index is a synthetic RangeIndex,
            # so a Series y carrying another index (e.g. after a shuffled
            # split) must be attached by position. Index alignment would
            # otherwise pair rows with the wrong labels or fill them with NaN.
            # A DataFrame X together with a Series y keeps pandas alignment.
            if x_was_array or not isinstance(y, pd.Series):
                y = pd.Series(np.asarray(y), index=X.index)

            data = X.copy()
            data['Label'] = y
            corr_matrix = data.corr()
            # The last row/column of the matrix belongs to the target.
            corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1]

        # A dict is used as an insertion-ordered set so drop_ has a
        # deterministic order.
        to_drop = {}
        for col in corr_X.columns:
            # Select columns that are correlated above the threshold
            corr = corr_X[col][corr_X[col] >= self.threshold]

            # A column always finds itself with correlation 1
            if len(corr) > 1:
                if corr_y is None:
                    # Drop all but the first one
                    redundant = corr.index[1:]
                else:
                    # Keep feature with the highest correlation with y
                    keep = corr_y[corr.index].idxmax()
                    redundant = corr.index.drop(keep)
                to_drop.update(dict.fromkeys(redundant))

        self.drop_ = list(to_drop)
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected in ``fit``, as a NumPy array.

        Labels in ``drop_`` that are not present in ``X`` are ignored.
        """
        # Convert arrays to DataFrame for dropping columns
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        return X.drop(columns=self.drop_, errors='ignore').values

    def filter_column_names(self, columns):
        """Return ``columns`` without the entries that correspond to ``drop_``.

        When ``drop_`` holds integers (fit on an array) they are used as
        positions into ``columns``; otherwise they are matched by name.
        """
        columns = np.asarray(columns)
        if not self.drop_:
            return columns.copy()
        if all(isinstance(label, (int, np.integer)) for label in self.drop_):
            return np.delete(columns, self.drop_)
        return columns[~np.isin(columns, self.drop_)]

In [40]:
# Fit the correlation filter on the outlier-free training data (fit returns
# the filter itself), then drop the redundant columns from that same data.
remove_multicollinearity = RemoveMulticollinearity(threshold=0.9).fit(
    X_train_no_outliers, Y_train_no_outliers
)
X_train_no_multicollinearity = remove_multicollinearity.transform(X_train_no_outliers)

In [41]:
# Names of the features that survive the correlation filter. `columns` must
# list the features in the same order as the matrix the filter was fitted on.
columns_filtered = remove_multicollinearity.filter_column_names(columns)

In [43]:
# Store the fitted filter's transform in the registry that is pickled at the
# end of the notebook.
treatment_functions['remove_multicolinearity'] = remove_multicollinearity.transform

In [72]:
# Also store the helper that applies the same column selection to a list of
# feature names.
treatment_functions['filter_column_names'] = remove_multicollinearity.filter_column_names

## Balanceamento de Classes

In [44]:
# Instantiate SMOTE with a fixed seed
smote = SMOTE(random_state=42)

# Resample the training data only; the test set is not passed to SMOTE
X_train_balanced, Y_train_balanced = smote.fit_resample(X_train_no_multicollinearity, Y_train_no_outliers)

## Salvar dataset pre-processado

In [76]:
# Each run writes into its own timestamped output directory.
current_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
dataset_treated_dir_path = './../Datasets/CSE-CIC-IDS2018/pre-processed/' + current_datetime
os.makedirs(dataset_treated_dir_path, exist_ok=True)

In [77]:
# Rebuild the surviving feature names from the frame the pipeline was fitted
# on. Deleting position 2 a second time removes a real feature name on a
# clean top-to-bottom run and shifts every later name by one.
columns_filtered = remove_multicollinearity.filter_column_names(X_train.columns.to_numpy())

# The names must match the processed matrix one-to-one.
assert len(columns_filtered) == X_train_balanced.shape[1], (
    "feature names and processed matrix have different widths"
)

In [83]:
# Number of feature names kept after filtering (display only).
np.size(columns_filtered)

35

In [None]:
# Assemble the balanced training frame. The target is attached by position
# so the result does not depend on whatever index Y_train_balanced carries.
df_train = pd.DataFrame(X_train_balanced, columns=columns_filtered)
df_train['Label'] = np.asarray(Y_train_balanced)

In [79]:
# NOTE(review): the file name ends in '.csv' but the frame is written in
# Parquet format; readers must use pd.read_parquet. Confirm which one is
# intended before renaming, since consumers may depend on this exact name.
train_filename = 'train_dataset_treated.csv'
train_file_path = os.path.join(dataset_treated_dir_path, train_filename)
df_train.to_parquet(path=train_file_path, index=False)

In [63]:
# Apply the column selection learned on the training data to the normalized
# test data. The test data is not passed through the outlier filter.
X_test_no_multicollinearity = remove_multicollinearity.transform(X_test_normalized)

In [64]:
# Assemble the test frame. Y_test still carries the row labels of the
# original dataset (shuffled by the split), while df_test has a fresh
# 0..n-1 index. Assigning the Series directly would align on those labels
# and attach wrong or NaN targets, so the values are attached by position.
df_test = pd.DataFrame(X_test_no_multicollinearity, columns=columns_filtered)
df_test['Label'] = Y_test.to_numpy()

In [71]:
# NOTE(review): the file name ends in '.csv' but the frame is written in
# Parquet format; readers must use pd.read_parquet. Confirm which one is
# intended before renaming, since consumers may depend on this exact name.
test_filename = 'test_dataset_treated.csv'
test_file_path = os.path.join(dataset_treated_dir_path, test_filename)
df_test.to_parquet(path=test_file_path, index=False)

## Salvar funções geradas pelo pré-processamento

In [75]:
# Persist the registry of fitted preprocessing callables next to the datasets.
# NOTE(review): the values are bound methods, one of them on a class defined
# in this notebook, so unpickling presumably needs that class to be importable
# under the same module name. Verify in the consuming code.
dict_functions_file_path = os.path.join(dataset_treated_dir_path, 'pre_processing_functions.pkl')

with open(dict_functions_file_path, mode='wb') as functions_file:
    pickle.dump(treatment_functions, functions_file)