In [None]:
import time
from os import cpu_count
from pathlib import Path

import pandas as pd

from load import (file_exists, RAW_FILE, FILE_NAMES, load_dataset, split_dataset)
from util.data_keys import Datakeys as dk

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import make_scorer

In [None]:
tmp_cores: int | None = cpu_count()
CORES: int = 4 if tmp_cores is None else tmp_cores
RANDOM_SEED: int = 1137
TEST_SIZE: float = 0.2

ML_FILE_PATH: str = "data/ml"
MANIPULATED_PATH: str = "data/manipulated"

In [None]:
# Only run the split when the first expected file is not there yet, so
# re-running the notebook skips the step once that file exists.
# NOTE(review): presumably split_dataset() writes the files listed in
# FILE_NAMES -- confirm in load.py.
first_split_missing: bool = not file_exists(FILE_NAMES[0])
if first_split_missing:
    split_dataset()

In [None]:
def get_models_dict() -> dict:
    """Build a fresh set of unfitted classifiers keyed by display name.

    Every estimator that accepts ``random_state`` is seeded with
    ``RANDOM_SEED`` so that repeated runs of the notebook produce the same
    scores. The two naive Bayes models take no seed.

    Returns:
        dict: display name -> newly constructed scikit-learn estimator.
        A new instance is created on every call.
    """
    return {
        "Random Forest": RandomForestClassifier(
            n_estimators=62, n_jobs=CORES, random_state=RANDOM_SEED
        ),
        "Gradient Boost": GradientBoostingClassifier(random_state=RANDOM_SEED),
        "SGD Classifier": SGDClassifier(random_state=RANDOM_SEED),
        # NOTE(review): "Benoulli" is a typo for "Bernoulli". The key is kept
        # unchanged because it is written to the result CSVs as the model
        # name; rename it together with any consumer of those files.
        "Benoulli NB": BernoulliNB(),
        "Linear SVC": LinearSVC(random_state=RANDOM_SEED),
        "Gaussian NB": GaussianNB(),
    }

In [None]:
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a z-score normalised copy of ``df``.

    Every column except the label column (``dk.ATTACK_TYPE.value``) is
    replaced by ``(column - mean) / std``, using the column's own mean and
    sample standard deviation. The input frame is not modified.

    Columns whose standard deviation is zero (constant column) or undefined
    (e.g. a single-row frame) are only centred, which leaves them at 0.0
    instead of turning them into NaN through a division by zero.

    Args:
        df: Frame holding the feature columns and, optionally, the label
            column.

    Returns:
        pd.DataFrame: A new frame with the same columns and index as ``df``.
    """
    label_column = dk.ATTACK_TYPE.value

    # Work on a copy so the caller's frame keeps its original values.
    normalized = df.copy()

    for column in df.columns:
        if column == label_column:
            continue

        values = df[column]
        centered = values - values.mean()
        spread = values.std()

        # Dividing by a zero/NaN spread would fill the column with NaN
        # (or inf); keep the centred values instead.
        if pd.isna(spread) or spread == 0:
            normalized[column] = centered
        else:
            normalized[column] = centered / spread

    return normalized

In [None]:
def fit_algorithms(models: dict, X_train: pd.DataFrame, y_train: pd.Series) -> None:
    """Fit every model on the training data and print how long each fit took.

    Args:
        models: Mapping of display name -> estimator exposing ``fit(X, y)``.
            The estimators are fitted in place.
        X_train: Training features passed to every ``fit`` call.
        y_train: Training labels passed to every ``fit`` call.
    """
    for label in models:
        estimator = models[label]

        began_at: float = time.time()
        estimator.fit(X_train, y_train)
        elapsed: float = time.time() - began_at

        print(f"{label} fit time: {elapsed}")

In [None]:
def clean_up_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns that are excluded from modelling.

    The input frame is not modified; ``DataFrame.drop`` returns a new frame.
    Every listed column must be present in ``df``.

    Args:
        df: Dataset frame containing all the columns listed below.

    Returns:
        pd.DataFrame: New frame without the excluded columns.
    """
    # String-valued column.
    text_keys = (dk.INTENSITY,)

    # "Answer" columns.
    # NOTE(review): presumably these describe the attack configuration and
    # would leak the target -- confirm.
    answer_keys = (
        dk.NUMBER_OF_BLACK_HOLES,
        dk.BLACK_HOLE_SWAP_PROB,
        dk.TARGETS_PER_BLACK_HOLE,
    )

    # Columns that hold a constant value.
    constant_keys = (
        dk.REQUESTS,
        dk.PARAMETER,
        dk.TOPOLOGY,
        dk.TOTAL_NO_PATHS,
        dk.NUMBER_OF_NODES,
    )

    # Redundant features. The original notes pair TOTAL_REQUEST_FAILS with
    # TOTAL_REQUEST_SUCCESS; TOTAL_SWAPPING_FAILS is presumably redundant
    # with a swapping-success counter (the original note repeated the FAILS
    # name) -- TODO confirm.
    redundant_keys = (
        dk.TOTAL_REQUEST_FAILS,
        dk.TOTAL_SWAPPING_FAILS,
    )

    # Deliberately not used as a feature.
    ignored_keys = (dk.SIMULATION_TIME,)

    excluded_columns = [
        key.value
        for key in (
            *text_keys,
            *answer_keys,
            *constant_keys,
            *redundant_keys,
            *ignored_keys,
        )
    ]
    return df.drop(columns=excluded_columns)

In [None]:
# Load the raw dataset and keep a seeded sample of 10,000 rows whose
# black-hole count is 0, with the non-feature columns removed.
all_df: pd.DataFrame = load_dataset(RAW_FILE)

no_black_holes: pd.Series = all_df[dk.NUMBER_OF_BLACK_HOLES.value] == 0

# DataFrame.sample raises ValueError if fewer than 10,000 rows match,
# because sampling is done without replacement.
normal_df: pd.DataFrame = clean_up_df(
    all_df.loc[no_black_holes].sample(n=10_000, random_state=RANDOM_SEED)
)

In [None]:
def predict_models_with_cross_validation(models: dict, X: pd.DataFrame, y: pd.Series) -> dict[str, dict]:
    """Cross-validate every model with 5 folds and collect the raw results.

    Each model is evaluated on accuracy, macro-averaged F1, precision and
    recall. The wall-clock time of each model's cross-validation is printed.

    Args:
        models: Mapping of display name -> unfitted estimator.
        X: Feature frame handed to ``cross_validate``.
        y: Label series handed to ``cross_validate``.

    Returns:
        dict[str, dict]: display name -> copy of the dict returned by
        ``cross_validate`` for that model.
    """
    # The metric set is the same for every model, so it is built once.
    # NOTE(review): F1 is macro-averaged while precision and recall use the
    # scorers' default averaging, and only precision sets zero_division --
    # confirm this mix is intended.
    scoring = {
        'accuracy': 'accuracy',
        'f1_score': make_scorer(f1_score, average='macro'),
        'precision_score': make_scorer(precision_score, zero_division=0),
        'recall_score': make_scorer(recall_score),
    }

    data: dict = {}
    for name, model in models.items():
        began_at = time.time()
        fold_results: dict = cross_validate(model, X, y, cv=5, scoring=scoring)
        elapsed = time.time() - began_at

        print(f"{name} cross time: {elapsed}")

        data[name] = dict(fold_results)

    return data

In [None]:
# For every attack dataset: combine it with the normal sample, normalise the
# features, cross-validate all models and save the per-model mean metrics.
for file_name in FILE_NAMES:
    current_file_name: str = MANIPULATED_PATH + "/" + file_name
    output_file_name: str = ML_FILE_PATH + "/" + file_name
    print("-"*60)
    print(current_file_name)

    # Create the output directory before the expensive work: to_csv does not
    # create missing directories, and failing only after cross-validation
    # would throw away the whole run for this file.
    Path(output_file_name).parent.mkdir(parents=True, exist_ok=True)

    attack_df: pd.DataFrame = clean_up_df(load_dataset(current_file_name))

    # NOTE(review): the normalisation statistics are computed on the whole
    # frame before cross-validation splits it, so every test fold influences
    # the scaling of its training folds. Consider scaling inside a pipeline.
    analyzis_df: pd.DataFrame = normalize_df(pd.concat(objs=[attack_df, normal_df]))

    X: pd.DataFrame = analyzis_df.drop(columns=dk.ATTACK_TYPE.value)
    y: pd.Series = analyzis_df[dk.ATTACK_TYPE.value]

    models: dict = get_models_dict()
    results: dict[str, dict] = predict_models_with_cross_validation(models, X, y)

    # Collapse the per-fold arrays into one mean value per metric and model.
    mean_score_dict: dict = {
        model_name: {
            metric_name: fold_scores.mean()
            for metric_name, fold_scores in model_results.items()
        }
        for model_name, model_results in results.items()
    }

    # One row per model; timing columns are dropped and the metric columns
    # get their presentation names.
    df_results: pd.DataFrame = (
        pd.DataFrame(mean_score_dict).T
        .reset_index()
        .drop(columns=["score_time", "fit_time"])
        .rename(
            columns={
                'index': 'Model',
                "test_accuracy": "Accuracy",
                "test_f1_score": "F1 Score",
                "test_precision_score": "Precision",
                "test_recall_score": "Recall"
            }
        )
    )

    df_results.to_csv(output_file_name, mode="w", index=False)
