In [None]:
from data_handling.data_ingestion import DataIngestion
from data_handling.data_preprocessing import DataPreprocessor
import os

# Root folder that holds every stage of the data pipeline.
DATA_FOLDER = 'data'
# Datasets handled by this cell, processed in list order.
# NOTE(review): 'NF_UNSW_NB15' appears twice, so it is processed twice --
# confirm this is intentional.
DATASETS_TO_INGEST = ['NF_UNSW_NB15', 'NF_ToN_IoT', 'NF_UNSW_NB15']
# When True, the preprocessing pass later in this cell is executed.
TEST_PREPROCESSING_AND_BUILD_STANDARDIZATION_STATISTICS = True

# One sub-directory of DATA_FOLDER per pipeline stage.
raw_data_dir, landed_data_dir, ingested_data_dir, utils_data_dir = (
    f'{DATA_FOLDER}/{stage}' for stage in ('raw', 'landed', 'ingested', 'utils')
)

# The ingestion object receives all four stage directories.
data_ingestor = DataIngestion(
    raw_data_dir,
    landed_data_dir,
    ingested_data_dir,
    utils_data_dir,
)

print('Starting data ingestion...')

# Name of the DataIngestion landing method for each supported dataset.
# The method is looked up lazily so that only the routines actually needed
# for DATASETS_TO_INGEST are touched.
landing_method_names = {
    'NF_ToN_IoT': 'land_raw_NF_ToN_IoT_files',
    'NF_BoT_IoT': 'land_raw_NF_BoT_IoT_files',
    'NF_UNSW_NB15': 'land_raw_NF_UNSW_NB15_files',
}

# Land the datasets if not already landed
landed_datasets = []
for dataset in DATASETS_TO_INGEST:
    print(f'Landing {dataset} dataset...')
    landing_method_name = landing_method_names.get(dataset)
    if landing_method_name is None:
        print(f'Error: Landing and ingestion routine not yet developed for dataset {dataset}.')
        # Do not ingest a dataset that could not be landed.
        continue
    getattr(data_ingestor, landing_method_name)()
    landed_datasets.append(dataset)

# Ingest the datasets if not already ingested (only those landed above)
for dataset in landed_datasets:
    print(f'Ingesting {dataset} dataset...')
    data_ingestor.ingest_attack_files(dataset)  # Ingest binary and mixed files for supervised learning
    data_ingestor.ingest_all_train_files(dataset)  # Ingest all training files for unsupervised learning

print('Data ingestion complete.')

if TEST_PREPROCESSING_AND_BUILD_STANDARDIZATION_STATISTICS:
    print('Testing preprocessing and building standardization statistics...')

    # Create the data preprocessor object
    data_processor = DataPreprocessor(ingested_data_dir, utils_data_dir)

    # Every preprocess_NF call in this cell uses the same options.
    preprocess_options = dict(
        keep_IPs_and_timestamp=True,
        binary=False,
        remove_minority_labels=False,
        only_attacks=False,
        scale=True,
        truncate=True,
    )

    # Supervised data: preprocess the mixed train and validation sets.
    for dataset_name in DATASETS_TO_INGEST:
        print(f'- for supervised {dataset_name} dataset...')
        attack_mapping = data_processor.load_attack_mapping(dataset_name)
        train_raw = data_processor.load_mixed_train(dataset_name)
        val_raw = data_processor.load_mixed_val(dataset_name)
        train_attrs, train_labels = data_processor.preprocess_NF(dataset_name, train_raw, **preprocess_options)
        val_attrs, val_labels = data_processor.preprocess_NF(dataset_name, val_raw, **preprocess_options)

    # Pretraining data: preprocess the first "all train" file of each dataset.
    for dataset_name in DATASETS_TO_INGEST:
        print(f'- for pretraining {dataset_name} dataset...')
        attack_mapping = data_processor.load_attack_mapping(dataset_name)
        # Look up the files of the dataset being processed (this was
        # previously hard-coded to 'NF_ToN_IoT' for every dataset).
        all_train_files_and_indices = data_processor.get_all_train_files_and_indices(dataset_name)
        # Take first file for testing
        all_train_file, all_train_file_idx = all_train_files_and_indices[0]
        # NOTE(review): 0.1 is presumably a sampling fraction -- confirm
        # against DataPreprocessor.load_all_train.
        dataset = data_processor.load_all_train(dataset_name, all_train_file_idx, 0.1)
        # NOTE(review): 'all' is passed instead of dataset_name, presumably to
        # use statistics shared across datasets -- confirm.
        preprocessed_train_attrs, labels = data_processor.preprocess_NF('all', dataset, **preprocess_options)


Testing preprocessing and building standardization statistics for pretraining NF_UNSW_NB15 dataset...
- Loading in all training set file nr 0...
--- Dropping 5802 rows with NaNs before preprocessing:
-- Unique values for one-hot encoding columns not found. Finding these values and writing to file...


  ohe[f'{attribute}_{value}'] = 0


-- Min-Max scaling ranges not found. Calculating these ranges and writing to file...
-- Min-Max scaling numerical columns...
Testing preprocessing and building standardization statistics for pretraining NF_ToN_IoT dataset...
- Loading in all training set file nr 0...
-- Min-Max scaling numerical columns...
Testing preprocessing and building standardization statistics for pretraining NF_UNSW_NB15 dataset...
- Loading in all training set file nr 0...
--- Dropping 5446 rows with NaNs before preprocessing:
-- Min-Max scaling numerical columns...
