In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
from pathlib import Path
import openpyxl
import itertools
import torch

# tools
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances, make_scorer, precision_score, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, friedmanchisquare, wilcoxon
from scipy.spatial import distance
from joblib import dump, load

# models
from xgboost import XGBClassifier, XGBRFClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from ctgan import CTGAN

### Read datasets

In [None]:
# Root folders of the UNSW input data.
_UNSW_DATA_DIR = "D:\\ml\\undersampling_data\\data\\unsw\\"
_REDUCED_DIR = _UNSW_DATA_DIR + "reduced\\"

# File-name prefixes of the four variants stored for every reduction method,
# in the order they are unpacked below.
_VARIANT_PREFIXES = ("borderline", "smote", "mix", "GAN")


def _read_reduced_family(suffix):
    """Read ``<prefix>_<suffix>.csv`` from the reduced folder for each prefix.

    Returns the four DataFrames as a list ordered like ``_VARIANT_PREFIXES``.
    """
    return [
        pd.read_csv(f"{_REDUCED_DIR}{prefix}_{suffix}.csv")
        for prefix in _VARIANT_PREFIXES
    ]


# Original df
original = pd.read_csv(_UNSW_DATA_DIR + "encoded_normalized\\original_data_normalized.csv")

# KMeans + centroid
borderline_KM_cent, smote_KM_cent, mix_KM_cent, GAN_KM_cent = _read_reduced_family("KM_centroids")

# KMeans + the nearest neighbor
borderline_KM_nn, smote_KM_nn, mix_KM_nn, GAN_KM_nn = _read_reduced_family("KM_nn")

# KMeans + cosine similarity
borderline_KM_cos, smote_KM_cos, mix_KM_cos, GAN_KM_cos = _read_reduced_family("KM_cos")

# KMeans + cosine similarity + Mahalanobis distance
(
    borderline_KM_cos_maha,
    smote_KM_cos_maha,
    mix_KM_cos_maha,
    GAN_KM_cos_maha,
) = _read_reduced_family("KM_cos_mal")

# HDBSCAN + the nearest neighbor
(
    borderline_HDBSCAN_nn,
    smote_HDBSCAN_nn,
    mix_HDBSCAN_nn,
    GAN_HDBSCAN_nn,
) = _read_reduced_family("HDBSCAN_NN")

# HDBSCAN + cosine similarity
(
    borderline_HDBSCAN_cos,
    smote_HDBSCAN_cos,
    mix_HDBSCAN_cos,
    GAN_HDBSCAN_cos,
) = _read_reduced_family("HDBSCAN_cos")

# KMeans (number of samples calculated by HDBSCAN) + the nearest neighbor
(
    borderline_KM_HDBSCAN_nn,
    smote_KM_HDBSCAN_nn,
    mix_KM_HDBSCAN_nn,
    GAN_KM_HDBSCAN_nn,
) = _read_reduced_family("kmeans&hdbscan_nn")

# KMeans (number of samples calculated by HDBSCAN) + cosine similarity
(
    borderline_KM_HDBSCAN_cos,
    smote_KM_HDBSCAN_cos,
    mix_KM_HDBSCAN_cos,
    GAN_KM_HDBSCAN_cos,
) = _read_reduced_family("kmeans&hdbscan_cos")

# KMeans (number of samples calculated by HDBSCAN) + cosine similarity + Mahalanobis distance
(
    borderline_KM_HDBSCAN_cos_maha,
    smote_KM_HDBSCAN_cos_maha,
    mix_KM_HDBSCAN_cos_maha,
    GAN_KM_HDBSCAN_cos_maha,
) = _read_reduced_family("kmeans&hdbscan_cos&mal")

In [None]:
def _features_and_target(frame):
    """Split *frame* into ``(X, y)``.

    ``X`` is the frame without the "target" and "source" columns and ``y`` is
    the "target" column.
    """
    return frame.drop(columns=["target", "source"]), frame["target"]


# Data for grid search (currently disabled): the same nine reduced variants,
# taken from the borderline_* frames and stored under un-prefixed keys
# ("original", "KM_C", "KM_NN", "KM_COS", "KM_COS_MAHA", "HDBSCAN_NN",
# "HDBSCAN_COS", "KM_SCAN_NN", "KM_SCAN_COS", "KM_SCAN_COS_MAHA").

# Data for models: every model prefix is paired with the reduced frames of one
# variant (DT -> borderline_*, RF -> smote_*, XGB -> mix_*, XGBRF -> GAN_*).
_reduced_frames_by_model = {
    "DT": {
        "KM_C": borderline_KM_cent,
        "KM_NN": borderline_KM_nn,
        "KM_COS": borderline_KM_cos,
        "KM_COS_MAHA": borderline_KM_cos_maha,
        "HDBSCAN_NN": borderline_HDBSCAN_nn,
        "HDBSCAN_COS": borderline_HDBSCAN_cos,
        "KM_SCAN_NN": borderline_KM_HDBSCAN_nn,
        "KM_SCAN_COS": borderline_KM_HDBSCAN_cos,
        "KM_SCAN_COS_MAHA": borderline_KM_HDBSCAN_cos_maha,
    },
    "RF": {
        "KM_C": smote_KM_cent,
        "KM_NN": smote_KM_nn,
        "KM_COS": smote_KM_cos,
        "KM_COS_MAHA": smote_KM_cos_maha,
        "HDBSCAN_NN": smote_HDBSCAN_nn,
        "HDBSCAN_COS": smote_HDBSCAN_cos,
        "KM_SCAN_NN": smote_KM_HDBSCAN_nn,
        "KM_SCAN_COS": smote_KM_HDBSCAN_cos,
        "KM_SCAN_COS_MAHA": smote_KM_HDBSCAN_cos_maha,
    },
    "XGB": {
        "KM_C": mix_KM_cent,
        "KM_NN": mix_KM_nn,
        "KM_COS": mix_KM_cos,
        "KM_COS_MAHA": mix_KM_cos_maha,
        "HDBSCAN_NN": mix_HDBSCAN_nn,
        "HDBSCAN_COS": mix_HDBSCAN_cos,
        "KM_SCAN_NN": mix_KM_HDBSCAN_nn,
        "KM_SCAN_COS": mix_KM_HDBSCAN_cos,
        "KM_SCAN_COS_MAHA": mix_KM_HDBSCAN_cos_maha,
    },
    "XGBRF": {
        "KM_C": GAN_KM_cent,
        "KM_NN": GAN_KM_nn,
        "KM_COS": GAN_KM_cos,
        "KM_COS_MAHA": GAN_KM_cos_maha,
        "HDBSCAN_NN": GAN_HDBSCAN_nn,
        "HDBSCAN_COS": GAN_HDBSCAN_cos,
        "KM_SCAN_NN": GAN_KM_HDBSCAN_nn,
        "KM_SCAN_COS": GAN_KM_HDBSCAN_cos,
        "KM_SCAN_COS_MAHA": GAN_KM_HDBSCAN_cos_maha,
    },
}

data = {}
for model_prefix, reduced_frames in _reduced_frames_by_model.items():
    # The bare prefix ("DT", "RF", ...) always holds the original dataset.
    data[model_prefix] = _features_and_target(original)
    for variant, reduced_frame in reduced_frames.items():
        data[f"{model_prefix}_{variant}"] = _features_and_target(reduced_frame)

### Grid Search

In [None]:
# Hyper-parameter candidates per model family (lists only, i.e. finite grids).
params_ = {
    "DT": {
        'max_depth': [4, 8],
        'min_samples_leaf': [1, 2],
        'random_state': [0]
    },
    "RF": {
        'n_estimators': [40, 80],
        'max_depth': [8, 12],
        'min_samples_leaf': [1],
        'random_state': [0]
    },
    "XGB": {
        'max_depth': [8, 12],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.7, 1]
    },
    "XGBRF": {
        'n_estimators': [40, 80],
        'max_depth': [8],
        'subsample': [0.7, 1],
        'random_state': [0]
    }
}

# Estimator prototypes per model family; the search clones them for every fit.
models_ = {
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "XGB": XGBClassifier(verbosity=0, use_label_encoder=False),
    # NOTE(review): this slot used to build a plain XGBClassifier although it is
    # labelled "XGBRF" and XGBRFClassifier is imported for it. Switched to the
    # random-forest estimator; confirm this matches the intended experiment.
    "XGBRF": XGBRFClassifier(booster='gbtree', grow_policy='depthwise', importance_type='gain', tree_method='auto', verbosity=0, use_label_encoder=False)
}

results_ = {}

# Upper bound on sampled parameter settings per search.
n_iter = 30

# Report label -> scikit-learn scorer name.
scoring_metrics = {
    'precision': 'precision',
    'accuracy': 'accuracy',
    'f1-score': 'f1',
    'roc_auc': 'roc_auc'
}


for name, (X, y) in data.items():
    print(f"\n🟦 Dataset: {name}")
    results_[name] = {}

    for model_name, model in models_.items():
        print(f" 🔍 Model: {model_name}")
        param_dist = params_[model_name]

        # Every grid above is smaller than n_iter, so the search is exhaustive
        # anyway; capping n_iter at the grid size only avoids the warning that
        # scikit-learn emits when n_iter exceeds the parameter space.
        grid_size = int(np.prod([len(values) for values in param_dist.values()]))
        search_iterations = min(n_iter, grid_size)

        results_[name][model_name] = {}

        for metric_name, metric in scoring_metrics.items():
            print(f"    📊 Metric: {metric_name}")
            search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_dist,
                n_iter=search_iterations,
                scoring=metric,
                cv=10,
                n_jobs=-1,
                verbose=0,
                random_state=42,
                return_train_score=False
            )

            search.fit(X, y)

            # Average of the per-candidate mean CV scores (computed once).
            mean_score = np.mean(search.cv_results_['mean_test_score'])

            results_[name][model_name][metric_name] = {
                'best_score': search.best_score_,
                'mean_test_score': mean_score,
                'best_params': search.best_params_
            }

            print(f"      ✅ Best score: {search.best_score_:.4f}")
            print(f"      📈 Mean score: {mean_score:.4f}")


# Flatten the nested results into one row per (dataset, model, metric).
summary = [
    {
        "Dataset": dataset_name,
        "Model": model_name,
        "Metric": metric_name,
        "Best Score": entry["best_score"],
        "Mean Score": entry["mean_test_score"],
        "Best Params": entry["best_params"]
    }
    for dataset_name, per_model in results_.items()
    for model_name, per_metric in per_model.items()
    for metric_name, entry in per_metric.items()
]

df_results = pd.DataFrame(summary)
df_results.to_csv("D:\\ml\\undersampling_data\\reports\\unsw\\grid_search\\results_summary.csv", index=False)

In [None]:
def _resolve_model(model_key):
    """Return the estimator to evaluate for the dataset key *model_key*.

    A ``models`` registry keyed by dataset name is used when one exists in the
    notebook namespace. Otherwise the ``models_`` prototype of the family named
    by the key's prefix is used (e.g. "DT_KM_C" -> ``models_["DT"]``).
    """
    registry = globals().get("models")
    if registry is not None and model_key in registry:
        return registry[model_key]
    # NOTE(review): `models` is not defined anywhere in this notebook, so the
    # loop below used to stop with a NameError. The fallback uses the untuned
    # prototypes; replace it with the tuned estimators if those were intended.
    return models_[model_key.split("_", 1)[0]]


# Variables
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
results_cross_mean_under = []
results_cross_std_under = []
goal = ['accuracy', 'precision', 'f1', 'recall']
path_files = ["D:\\ml\\undersampling_data\\reports\\unsw\\results\\accuracy_metrics.txt",
              "D:\\ml\\undersampling_data\\reports\\unsw\\results\\precision_metrics.txt",
              "D:\\ml\\undersampling_data\\reports\\unsw\\results\\f1_metrics.txt",
              "D:\\ml\\undersampling_data\\reports\\unsw\\results\\recall_metrics.txt"]

excel_file_cross = "D:\\ml\\undersampling_data\\reports\\unsw\\results\\cross.xlsx"
excel_file_cross2 = "D:\\ml\\undersampling_data\\reports\\unsw\\results\\cross_std.xlsx"
name_sheet1 = "Arkusz1"

# metric -> {dataset key -> array of per-fold scores}
raw_scores = {metric: {} for metric in goal}

# Loop 10x10 cross validation for all models and all datasets
for metrix1, path_file in zip(goal, path_files):
    for model_key, (X_train, y_train) in data.items():
        model = _resolve_model(model_key)

        # Cross validation
        cross_val_results = cross_val_score(model, X_train, y_train, cv=cv, scoring=metrix1, n_jobs=1)
        print(f"Learn: {metrix1} and model {model_key}")

        raw_scores[metrix1][model_key] = cross_val_results

        # Append the fold scores and their mean/std to the per-metric text file
        with open(path_file, "a+") as f:
            print(f'{model_key} Cross-Validation Results {metrix1}:\n {cross_val_results}', file=f)
            print(f'Mean {metrix1}: {cross_val_results.mean()}', file=f)
            print(f'Dev: {cross_val_results.std()}', file=f)
            print("\n", file=f)

        # Collect results for DataFrame
        results_cross_mean_under.append({
            "Model": model_key,
            "Metric": metrix1,
            "Result": cross_val_results.mean()})
        results_cross_std_under.append({
            "Model": model_key,
            "Metric": metrix1,
            "Std": cross_val_results.std()})

        # Fit on the full dataset and persist the fitted model
        model.fit(X_train, y_train)
        with open(f"D:\\ml\\undersampling_data\\models\\unsw\\{model_key}_{metrix1}_model.pkl", "wb") as f:
            pickle.dump(model, f)

    # Save raw scores to CSV
    df_raw = pd.DataFrame(raw_scores[metrix1])
    df_raw.to_csv(f"D:\\ml\\undersampling_data\\reports\\unsw\\results\\raw_scores_{metrix1}.csv", index=False)

# Create DataFrames and save to Excel
df_results_cross_under_mean = pd.DataFrame(results_cross_mean_under)
df_results_cross_under_std = pd.DataFrame(results_cross_std_under)

df_save_under_mean = df_results_cross_under_mean.pivot(index="Metric", columns="Model", values="Result")
df_save_under_std = df_results_cross_under_std.pivot(index="Metric", columns="Model", values="Std")

# pd.ExcelFile only reads workbooks; writing requires pd.ExcelWriter.
with pd.ExcelWriter(excel_file_cross) as w:
    df_save_under_mean.to_excel(w, sheet_name=name_sheet1)
with pd.ExcelWriter(excel_file_cross2) as w1:
    df_save_under_std.to_excel(w1, sheet_name=name_sheet1)

# Visualization results for all models and all datasets
for metric in goal:
    # Choose metric
    df_metric_mean = df_save_under_mean.loc[metric]   # mean
    df_metric_std = df_save_under_std.loc[metric]     # std

    # Lower y-limit: smallest mean minus the largest std and a 0.05 margin
    max_std = df_metric_std.max()
    min_mean = df_metric_mean.min()
    ymin = round((min_mean - (max_std + 0.05)), 1)

    plt.figure(figsize=(10, 5))
    plt.errorbar(
        df_metric_mean.index,                # X: models
        df_metric_mean.values,               # Y: mean
        yerr=df_metric_std.values,           # std
        fmt="o",                             # marker
        ecolor="red", capsize=3,             # color and capsize for error bars
        color="blue", label=metric
    )

    plt.title(f"Cross-validation mean scores ± std for {metric} metric")
    plt.ylim(ymin, 1)
    plt.ylabel("Score")
    plt.xticks(rotation=90)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(f"D:\\ml\\undersampling_data\\reports\\unsw\\results\\{metric}_chart.png", dpi=300)
    plt.show()

### Statistical analysis

In [None]:
# Per-fold cross-validation scores written by the cross-validation cell.
_RAW_SCORES_DIR = "D:\\ml\\undersampling_data\\reports\\unsw\\results\\"

row_acc = pd.read_csv(_RAW_SCORES_DIR + "raw_scores_accuracy.csv")
row_prec = pd.read_csv(_RAW_SCORES_DIR + "raw_scores_precision.csv")
row_f1 = pd.read_csv(_RAW_SCORES_DIR + "raw_scores_f1.csv")
row_recall = pd.read_csv(_RAW_SCORES_DIR + "raw_scores_recall.csv")

# metric name -> DataFrame with one column of scores per model
results_cv = {
    "accuracy": row_acc,
    "precision": row_prec,
    "f1": row_f1,
    "recall": row_recall,
}


def _show_score_distribution(draw, label, metric, scores, **draw_kwargs):
    """Draw *scores* with the seaborn function *draw* and show the figure."""
    plt.figure(figsize=(8,6))
    draw(data=scores, **draw_kwargs)
    plt.title(f"{label} for {metric} metric")
    plt.ylabel("Score")
    plt.xticks(rotation=90)
    plt.tight_layout()
    # Saving is disabled; the figures used to go to
    # ...\\reports\\unsw\\results\\stat_analyze\\{metric}_boxplot.png / {metric}_violinplot.png
    plt.show()


for name, df in results_cv.items():
    # One sample of non-missing scores per model column, shared by both tests.
    samples = [df[model].dropna().values for model in df.columns]

    anova_res = f_oneway(*samples)
    print(f"Results for ANOVA: {anova_res}\n")

    friedman_res = friedmanchisquare(*samples)
    print("\nFriedman test results:")
    print(friedman_res)

    _show_score_distribution(sns.boxplot, "Boxplot", name, df)
    _show_score_distribution(sns.violinplot, "Violinplot", name, df, inner="box")

"""    
path = f"D:\\ml\\undersampling_data\\reports\\ssh\\results\\stat_analyze\\stat_results.txt"

    # --- zapis do pliku ---
with open(path, "w", encoding="utf-8") as fp:
    fp.write("Statistical analysis results (ANOVA & Friedman)\n")
    fp.write("="*80 + "\n\n")
    
    for metric_name, df in results_cv.items():
      # ANOVA
        anova_res = f_oneway(*(df[model].dropna().values for model in df.columns))
        # Friedman
        friedman_res = friedmanchisquare(*(df[model].dropna().values for model in df.columns))

        # zapis wyników
        fp.write(f"Metric: {metric_name}\n")
        fp.write("-"*40 + "\n")
        fp.write(f"ANOVA results -> Statistic: {anova_res.statistic:.4f}, "
                 f"p-value: {anova_res.pvalue:.4e}\n")
        fp.write(f"Friedman results -> Statistic: {friedman_res.statistic:.4f}, "
                f"p-value: {friedman_res.pvalue:.4e}\n\n") """

# I focus on the F1-score because it is the most important metric for detecting security incidents. Recall also matters, since it shows how many incidents would be overlooked. The F1-score is the main metric; the remaining metrics are supplementary information.

In [None]:
def posthoc_wilcoxon_bonferroni(df_raw, metric_name, alpha=0.05):
    """Run a Friedman test and, if significant, pairwise Wilcoxon post-hoc tests.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        One column of scores per model, one row per cross-validation fold.
    metric_name : str
        Name of the metric; only used in the printed header.
    alpha : float, default 0.05
        Significance level for the Friedman test and for the
        Bonferroni-corrected Wilcoxon p-values.

    Returns
    -------
    pandas.DataFrame or None
        One row per model pair with the columns "Model1", "Model2",
        "statistic", "pvalue_raw", "pvalue_corrected" and "reject_H0";
        ``None`` when the Friedman p-value is not <= ``alpha``.
    """
    # Both tests need paired samples, so drop every fold that lacks a score for
    # any model. Dropping NaN per column could leave samples of unequal length
    # for Friedman and would pass NaN straight into Wilcoxon.
    paired = df_raw.dropna()

    # --- Friedman test ---
    friedman_res = friedmanchisquare(*(paired[model].values for model in paired.columns))
    print(f"\n=== Friedman test for {metric_name} ===")
    print(f"Statistic={friedman_res.statistic:.3f}, p={friedman_res.pvalue:.3e}")

    # Written as "not <=" so that a NaN p-value is treated as not significant.
    if not friedman_res.pvalue <= alpha:
        print("Brak istotnych różnic wg Friedmana.")
        return None

    # --- Wilcoxon signed-rank test for every pair of models ---
    results = []
    for m1, m2 in itertools.combinations(paired.columns, 2):
        differences = paired[m1] - paired[m2]
        if (differences == 0).all():
            # Identical score vectors: the test is undefined when every
            # difference is zero, so report "no difference" explicitly.
            stat, p = 0.0, 1.0
        else:
            stat, p = wilcoxon(paired[m1], paired[m2])
        results.append({"Model1": m1, "Model2": m2, "statistic": stat, "pvalue_raw": p})

    df_results = pd.DataFrame(results)

    # --- Bonferroni correction: multiply by the number of comparisons ---
    m = len(df_results)
    df_results["pvalue_corrected"] = (df_results["pvalue_raw"] * m).clip(upper=1.0)
    df_results["reject_H0"] = df_results["pvalue_corrected"] < alpha
    return df_results


def analyze_all_metrics(
    raw_scores,
    main_metric="f1",
    output_path="D:\\ml\\undersampling_data\\reports\\unsw\\results\\stat_analyze\\posthoc_summary.txt",
):
    """Write a per-metric statistical report and return the best model.

    Parameters
    ----------
    raw_scores : dict
        Maps a metric name to a DataFrame whose columns are models and whose
        rows are cross-validation folds.
    main_metric : str, default "f1"
        Metric whose best model is returned.
    output_path : str
        Text file that receives the report; it is overwritten.

    Returns
    -------
    The column name with the highest mean score for ``main_metric``, or
    ``None`` when ``main_metric`` is not a key of ``raw_scores``.
    """
    best_models = {}

    with open(output_path, "w", encoding="utf-8") as f:
        # Iterate the argument itself; the loop previously read the global
        # `results_cv` and silently ignored whatever the caller passed in.
        for metric_name, df_raw in raw_scores.items():
            f.write(f"\n### Metric: {metric_name}\n")

            # descriptive statistics
            desc = df_raw.describe().T[["mean", "std"]]
            f.write("Mean ± Std:\n")
            f.write(desc.to_string())
            f.write("\n")

            # post-hoc tests (None when the Friedman test is not significant)
            df_posthoc = posthoc_wilcoxon_bonferroni(df_raw, metric_name)
            if df_posthoc is not None:
                f.write("\nPost-hoc Wilcoxon + Bonferroni:\n")
                f.write(df_posthoc.to_string())
                f.write("\n")

            # pick the model with the highest mean score for this metric
            top_model = desc["mean"].idxmax()
            best_models[metric_name] = top_model
            f.write(f"\nBest model for {metric_name}: {top_model}\n")
            f.write("="*60 + "\n")

    # the overall winner is the best model for main_metric
    final_best = best_models.get(main_metric)
    print(f"\n>>> Najlepszy model wg {main_metric.upper()}: {final_best}")
    return final_best


# --- usage example ---
# The expected argument is a dict of per-metric score tables, e.g.
# {"accuracy": df_raw_acc, "precision": df_raw_prec, "recall": df_raw_rec, "f1": df_raw_f1}
final_model = analyze_all_metrics(raw_scores=results_cv, main_metric="f1")
