In [1]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [2]:
def load_dataset(file_path, file_name, nrows=10000000):
    """Read at most ``nrows`` rows of a CSV file into a DataFrame.

    The file is looked up at ``file_path`` joined with ``file_name`` and its
    first line is used as the header. ``Column85`` is requested as ``str``,
    and surrounding whitespace is stripped from every column name.
    """
    csv_location = os.path.join(file_path, file_name)
    read_options = {
        "header": 0,
        "nrows": nrows,
        "dtype": {'Column85': str},
        "low_memory": False,
    }
    frame = pd.read_csv(csv_location, **read_options)

    # Normalise the column names by removing leading/trailing whitespace.
    frame.columns = frame.columns.str.strip()
    return frame

def get_samples_by_label(data_frame, label):
    """Return the rows of ``data_frame`` whose ``Label`` value equals ``label``."""
    matches_label = data_frame["Label"].eq(label)
    return data_frame.loc[matches_label]

def delete_duplicates(data):
    """Remove fully duplicated rows from ``data`` in place and return it.

    Prints how many duplicate rows were found before removal and the shape
    of the frame afterwards. The index is renumbered from zero.
    """
    duplicate_count = data.duplicated().sum()
    print(duplicate_count, "fully duplicate rows to remove")

    # ignore_index renumbers the surviving rows 0..n-1 in the same call.
    data.drop_duplicates(inplace=True, ignore_index=True)

    print(data.shape)
    return data

def combine_datasets(file_path, file_names, nrows=15000):
    """Load several CSV files and stack them into one DataFrame.

    Each name in ``file_names`` is read from ``file_path`` through
    ``load_dataset`` with at most ``nrows`` rows per file. A file that fails
    to load is reported on stdout and skipped; the remaining frames are
    concatenated with a fresh 0..n-1 index.

    Args:
        file_path: directory containing the files.
        file_names: iterable of file names to load.
        nrows: maximum number of rows read from each file.

    Returns:
        The concatenation of every frame that loaded successfully.

    Raises:
        ValueError: if no file could be loaded (including when
            ``file_names`` is empty).
    """
    dataframes = []
    failed_count = 0

    for name in file_names:
        try:
            df = load_dataset(file_path, name, nrows)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading {os.path.join(file_path, name)}: {e}")
            failed_count += 1
        else:
            dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"No datasets could be loaded from {file_path!r} "
            f"({failed_count} file(s) failed to load)"
        )

    return pd.concat(dataframes, ignore_index=True)


def calculate_label_distribution(dataframe, label_column):
    """Summarise how often each value of ``label_column`` occurs.

    Returns a dict keyed by label. Each value is a dict with the label's
    ``'count'`` and its ``'percentage'`` relative to the total number of
    rows in ``dataframe``.
    """
    row_total = dataframe.shape[0]
    counts_by_label = dataframe[label_column].value_counts()

    return {
        label: {'count': occurrences, 'percentage': (occurrences / row_total) * 100}
        for label, occurrences in counts_by_label.items()
    }

def display_label_distribution(label_distribution):
    """Print the sample count and percentage share of every label.

    ``label_distribution`` maps each label to a dict that provides
    ``'count'`` and ``'percentage'`` entries; the percentage is shown with
    two decimals.
    """
    for name, entry in label_distribution.items():
        print(f"Samples Count: {name}: {entry['count']}")
        print(f"Percentage distribution of data: {name}: {entry['percentage']:.2f}%")
    

def display_unique_labels(data_frame):
    """Print the distinct values found in the ``Label`` column."""
    print("Unique Labels:", data_frame["Label"].unique())

def display_dataset_length(data_frame):
    """Print ``len(data_frame)``, i.e. the number of rows for a DataFrame."""
    print("Dataset length:", len(data_frame))

def generate_smoth_samples(X_train, Y_train):
    """Resample the training data with SMOTE and print the new class counts.

    Returns the resampled features and labels as
    ``(X_resampled, y_resampled)``.
    """
    # Build the sampler with a pinned random_state (42).
    oversampler = SMOTE(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, Y_train)

    # Report the per-class sample counts after resampling.
    print("Class distribution after application of SMOTE:")
    class_counts = pd.Series(y_resampled).value_counts()
    print(class_counts)

    return X_resampled, y_resampled


def map_label(df, label_mapping=None):
    """Replace the values of ``df['Label']`` with numerical class ids.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing a ``Label`` column.
        label_mapping: optional dict from label value to numerical id. When
            omitted, the built-in mapping (BENIGN=0, NetBIOS=1, LDAP=2,
            MSSQL=3, Portmap=4, Syn=5, UDP=6) is used.

    Returns:
        The same ``df`` object with ``Label`` remapped. Labels that are not
        keys of the mapping become NaN, as ``Series.map`` does for a dict.
    """
    if label_mapping is None:
        # Default mapping of labels to numerical values.
        label_mapping = {
            "BENIGN": 0,
            "NetBIOS": 1,
            "LDAP": 2,
            "MSSQL": 3,
            "Portmap": 4,
            "Syn": 5,
            "UDP": 6,
        }

    df['Label'] = df['Label'].map(label_mapping)

    return df

def imputation_missing_number_values(df, max_missing_percentage=20):
    """Fill or drop numeric columns of ``df`` that contain missing values.

    Infinite values are first replaced by NaN across the whole frame. Each
    numeric column with missing values is then either filled with its
    median (when at most ``max_missing_percentage`` percent of its values
    are missing) or dropped. The frame is modified in place and a summary
    is printed.

    Args:
        df: DataFrame to clean.
        max_missing_percentage: largest share of missing values (in
            percent) for which a column is still imputed rather than
            dropped. Defaults to 20.

    Returns:
        The same ``df`` object after imputation and column removal.
    """
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    columns_imputed = 0
    columns_to_drop = []

    for column in df.select_dtypes(include=[np.number]).columns:
        missing_percentage = df[column].isnull().mean() * 100

        if 0 < missing_percentage <= max_missing_percentage:
            df[column] = df[column].fillna(df[column].median())
            columns_imputed += 1
        elif missing_percentage > 0:
            # Some values are missing but more than the threshold allows.
            columns_to_drop.append(column)

    df.drop(columns=columns_to_drop, inplace=True)

    print(f"Imputation median in {columns_imputed} columns.")
    print(f"Deleted {len(columns_to_drop)} columns")

    return df

def trim_datataset(mapped_df, important_features_indices):
    """Return a new frame holding only the selected feature columns plus ``Label``.

    Columns are picked by position through ``important_features_indices``;
    the ``Label`` column of ``mapped_df`` is then set on the result so it is
    always present.
    """
    selected_features = mapped_df.iloc[:, important_features_indices]
    # assign() works on a copy, so mapped_df itself is left untouched.
    return selected_features.assign(Label=mapped_df['Label'])