In [1]:
def process_one_file(file_path, real_data_path, output_path):
    """Evaluate one synthetic dataset against the real dataset and save the metrics.

    The function computes, in order:
      1. per-feature KS test, Wasserstein distance and Jensen-Shannon divergence,
      2. the class-balance difference on the 'target' column (CB Diff),
      3. TSTR (train on synthetic, test on real) with a random forest and an MLP,
      4. a class-weighted relative gap of an approximate pairwise-distance diversity,
      5. the Vendi score of a subsample of real and synthetic features,
      6. the mean and variance of the synthetic features.

    Parameters:
    - file_path (str): CSV file holding the synthetic data (must contain 'target').
    - real_data_path (str): CSV file holding the real data (must contain 'target').
    - output_path (str): Directory in which ``results_<name>.csv`` is written;
      it is created if it does not exist.

    Returns:
    - pd.DataFrame: one row per feature plus a final 'GLOBAL' summary row.

    Raises:
    - KeyError: if the synthetic file lacks a column present in the real data.
    """
    # Imports are kept local to the function, as in the original implementation.
    import gc
    import os
    import tempfile

    import numpy as np
    import pandas as pd
    from scipy.stats import ks_2samp, wasserstein_distance
    from scipy.spatial.distance import jensenshannon
    from sklearn.metrics.pairwise import rbf_kernel
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import train_test_split
    import tensorflow as tf

    def approximate_diversity(X, n_samples=1000000, random_state=42):
        """Mean Euclidean distance over randomly drawn pairs of distinct rows of X."""
        # A local RandomState seeded like the former np.random.seed() call draws
        # the same indices without overwriting the process-wide NumPy RNG state.
        rng = np.random.RandomState(random_state)
        n = X.shape[0]
        idx_1 = rng.randint(0, n, size=n_samples)
        idx_2 = rng.randint(0, n, size=n_samples)
        mask = idx_1 != idx_2  # discard pairs made of the same row
        return np.mean(np.linalg.norm(X[idx_1[mask]] - X[idx_2[mask]], axis=1))

    def vendi_score(X, gamma=1e-5, normalize=True):
        """Exponential of the entropy of the eigenvalues of the RBF kernel matrix of X."""
        K = rbf_kernel(X, gamma=gamma)
        if normalize:
            trace = np.trace(K)
            if trace == 0:
                raise ValueError("Trace nulle.")
            K /= trace
        eigvals = np.linalg.eigvalsh(K)
        # Clip to keep log() finite for (numerically) zero or negative eigenvalues.
        eigvals = np.clip(eigvals, 1e-12, 1.0)
        entropy = -np.sum(eigvals * np.log(eigvals))
        return np.exp(entropy)

    # Base name of the synthetic file, without extension
    nom = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n🟢 Traitement de : {nom}")

    real_data = pd.read_csv(real_data_path)
    synthetic_data = pd.read_csv(file_path)

    # Fail early, with an explicit message, instead of midway through the metrics.
    missing_cols = [c for c in real_data.columns if c not in synthetic_data.columns]
    if missing_cols:
        raise KeyError(f"Synthetic file {file_path} is missing columns: {missing_cols}")

    # Make sure the results can be written before running the expensive steps.
    os.makedirs(output_path, exist_ok=True)

    # Boolean columns are cast to float so the numeric metrics below accept them.
    for col in real_data.columns:
        if real_data[col].dtype == bool:
            real_data[col] = real_data[col].astype(float)
            synthetic_data[col] = synthetic_data[col].astype(float)

    # 1. Distribution tests
    print("▶️  [1] KS / Wasserstein / JSD")
    ks_results = []
    equal_count, different_count = 0, 0
    for column in real_data.columns:
        if column == 'target':
            continue
        real_values = real_data[column].values
        synthetic_values = synthetic_data[column].values
        result = {'Feature': column}
        ks_statistic, ks_p_value = ks_2samp(real_values, synthetic_values)
        result['KS Statistic'] = ks_statistic
        result['KS P-value'] = ks_p_value
        equal_count += int(ks_p_value > 0.05)
        different_count += int(ks_p_value <= 0.05)
        result['Wasserstein Distance'] = wasserstein_distance(real_values, synthetic_values)
        # Shared bin edges so both histograms are comparable bin by bin.
        hist_range = (min(real_values.min(), synthetic_values.min()), max(real_values.max(), synthetic_values.max()))
        p_hist, _ = np.histogram(real_values, bins=100, range=hist_range, density=True)
        q_hist, _ = np.histogram(synthetic_values, bins=100, range=hist_range, density=True)
        # Small offset avoids empty bins before renormalising to probabilities.
        p_hist += 1e-8
        q_hist += 1e-8
        p_hist /= p_hist.sum()
        q_hist /= q_hist.sum()
        # jensenshannon() returns the distance; squaring gives the divergence.
        result['Jensen-Shannon Divergence'] = jensenshannon(p_hist, q_hist, base=2) ** 2
        ks_results.append(result)

    ks_results_df = pd.DataFrame(ks_results)

    # 2. CB Diff: summed absolute difference of class proportions, in percent
    print("▶️  [2] Distribution des classes (CB Diff)")
    real_dist = real_data['target'].value_counts(normalize=True)
    synth_dist = synthetic_data['target'].value_counts(normalize=True)
    all_classes = sorted(set(real_dist.index) | set(synth_dist.index))
    cb_diff = np.sum(np.abs(real_dist.reindex(all_classes, fill_value=0) - synth_dist.reindex(all_classes, fill_value=0))) * 100

    # 3.1 TSTR - RandomForest
    print("▶️  [3.1] TSTR - RandomForestClassifier")
    X = real_data.drop(columns='target')
    y = real_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    feature_cols = X.columns
    X_synthetic = synthetic_data.drop(columns='target').reindex(columns=X_test.columns)
    y_synthetic = synthetic_data['target']
    model_rf = RandomForestClassifier(random_state=42)
    model_rf.fit(X_synthetic, y_synthetic)
    y_pred_rf = model_rf.predict(X_test)
    tstr_rf_acc = accuracy_score(y_test, y_pred_rf)
    tstr_rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')

    # 3.2 TSTR - MLP with TensorFlow (best epoch restored through ModelCheckpoint)
    print("▶️  [3.2] TSTR - MLP (Keras)")
    X_synthetic_np = X_synthetic.values
    y_synthetic_np = y_synthetic.values
    X_test_np = X_test.values
    y_test_np = y_test.values

    # Encode labels as 0..K-1 over the union of both label sets: the sparse loss
    # needs contiguous integer indices, and the synthetic data may hold a class
    # that is absent from the real test split (or the reverse).
    classes = np.unique(np.concatenate([y_synthetic_np, y_test_np]))
    y_synthetic_enc = np.searchsorted(classes, y_synthetic_np)
    y_test_enc = np.searchsorted(classes, y_test_np)

    input_dim = X_synthetic_np.shape[1]
    num_classes = len(classes)
    is_multiclass = num_classes > 2
    loss_fn = 'sparse_categorical_crossentropy' if is_multiclass else 'binary_crossentropy'
    activation_out = 'softmax' if is_multiclass else 'sigmoid'
    # Binary case: a single sigmoid unit. Two sigmoid units would produce an
    # (n, 2) output whose flattened thresholding yields 2*n predictions.
    n_outputs = num_classes if is_multiclass else 1

    model_mlp = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(n_outputs, activation=activation_out)
    ])

    model_mlp.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=loss_fn, metrics=['accuracy'])

    # The checkpoint lives in a private temporary directory: it is removed even
    # if training fails or is interrupted, and concurrent runs cannot collide.
    with tempfile.TemporaryDirectory() as tmp_dir:
        checkpoint_path = os.path.join(tmp_dir, "best_mlp.weights.h5")
        checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, monitor='val_accuracy', save_best_only=True, save_weights_only=True, verbose=0
        )

        # NOTE(review): the best epoch is selected on the real test split that is
        # also used for the reported score, which optimistically biases the MLP
        # metrics. Kept as-is for comparability with existing results.
        model_mlp.fit(
            X_synthetic_np, y_synthetic_enc,
            validation_data=(X_test_np, y_test_enc),
            epochs=50,
            batch_size=128,
            callbacks=[checkpoint_cb],
            verbose=0
        )

        # No file is written if the monitored metric never improved.
        if os.path.exists(checkpoint_path):
            model_mlp.load_weights(checkpoint_path)

    y_pred_probs = model_mlp.predict(X_test_np, verbose=0)
    if is_multiclass:
        pred_idx = np.argmax(y_pred_probs, axis=1)
    else:
        pred_idx = (y_pred_probs.reshape(-1) > 0.5).astype(int)
        # Keeps the index valid in the degenerate single-class case.
        pred_idx = np.minimum(pred_idx, num_classes - 1)
    # Decode class indices back to the original label values.
    y_pred_mlp = classes[pred_idx]
    tstr_mlp_acc = accuracy_score(y_test_np, y_pred_mlp)
    tstr_mlp_f1 = f1_score(y_test_np, y_pred_mlp, average='weighted')

    # 4. Approximate Diversity (class-weighted relative gap)
    print("▶️  [4] Approximate Diversity (écart relatif pondéré)")

    gap_weighted_total = 0
    weight_total = 0
    all_classes = sorted(set(real_data['target'].unique()) | set(synthetic_data['target'].unique()))

    for cls in all_classes:
        real_cls = real_data[real_data['target'] == cls]
        synth_cls = synthetic_data[synthetic_data['target'] == cls]

        # At least two rows are needed on each side to form a pair.
        if len(real_cls) < 2 or len(synth_cls) < 2:
            continue

        # Use the same feature columns on both sides, so extra columns in the
        # synthetic file cannot distort the distances.
        X_real_cls = real_cls[feature_cols].values
        X_synth_cls = synth_cls[feature_cols].values

        div_real_cls = approximate_diversity(X_real_cls)
        div_synth_cls = approximate_diversity(X_synth_cls)
        # A zero (or undefined) real diversity makes the relative gap undefined.
        if not np.isfinite(div_real_cls) or div_real_cls == 0:
            continue
        gap_cls = (div_synth_cls - div_real_cls) / div_real_cls * 100

        weight = len(real_cls) / len(real_data)
        gap_weighted_total += weight * gap_cls
        weight_total += weight

    # NaN rather than ZeroDivisionError when no class could be evaluated.
    div_gap = gap_weighted_total / weight_total if weight_total > 0 else float('nan')

    # 5. Vendi Score (on at most 3000 rows per dataset)
    print("▶️  [5] Vendi Score")
    n_real = min(3000, len(X))
    n_synth = min(3000, len(X_synthetic))
    X_real_sub = X.sample(n=n_real, random_state=42).values
    X_synth_sub = X_synthetic.sample(n=n_synth, random_state=42).values
    vendi_real = vendi_score(X_real_sub)
    vendi_synth = vendi_score(X_synth_sub)
    vendi_gap = (vendi_synth - vendi_real) / vendi_real * 100

    # 6. Mean / variance
    print("▶️  [6] Moyenne et variance des features synthétiques")
    synth_mean = X_synthetic.mean().mean()
    synth_var = X_synthetic.var().mean()

    # Summary row
    summary = {
        'Feature': 'GLOBAL',
        'KS Statistic': ks_results_df['KS Statistic'].mean(),
        'KS P-value': ks_results_df['KS P-value'].mean(),
        'Wasserstein Distance': ks_results_df['Wasserstein Distance'].mean(),
        'Jensen-Shannon Divergence': ks_results_df['Jensen-Shannon Divergence'].mean(),
        '# Features': len(X.columns),
        '# Similaires (p > 0.05)': equal_count,
        '# Différentes': different_count,
        'PD (%)': 100 * different_count / len(X.columns),
        'CB Diff (%)': cb_diff,
        'TSTR RF Accuracy': tstr_rf_acc,
        'TSTR RF F1-score': tstr_rf_f1,
        'TSTR MLP Accuracy': tstr_mlp_acc,
        'TSTR MLP F1-score': tstr_mlp_f1,
        'Diversity Gap (%)': div_gap,
        'Vendi Score (real)': vendi_real,
        'Vendi Score (synth)': vendi_synth,
        'Vendi Gap (%)': vendi_gap,
        'Mean (synth)': synth_mean,
        'Variance (synth)': synth_var
    }

    ks_results_df = pd.concat([ks_results_df, pd.DataFrame([summary])], ignore_index=True)
    output_file = os.path.join(output_path, f"results_{nom}.csv")
    print(f"💾  Sauvegarde dans : {output_file}")
    ks_results_df.to_csv(output_file, index=False)
    print("✅  Terminé.\n")

    # Release model memory so repeated calls in a loop do not accumulate it.
    del model_mlp, model_rf
    tf.keras.backend.clear_session()
    gc.collect()
    return ks_results_df


In [3]:
# Evaluate a single CICIDS generation file and keep the resulting metrics table.
ks_results_df = process_one_file(
    file_path="./generations_cicids/cluster_vae_df_eps_0pt5.csv",
    real_data_path="./dataset/cicids2017_clean_all_labels.csv",
    output_path="./results_cicids",
)
# NSL-KDD variant of the same call, kept for reference:
# ks_results_df = process_one_file("./generations_nsl/cluster_vae_df_eps_1pt75.csv", "./dataset/kdd_full_clean_5classes.csv", "./results_nsl")

2025-04-17 13:52:20.005311: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-17 13:52:20.412904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744890740.591819     896 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744890740.636789     896 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744890741.035314     896 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 


🟢 Traitement de : cluster_vae_df_eps_0pt5
▶️  [1] KS / Wasserstein / JSD
▶️  [2] Distribution des classes (CB Diff)
▶️  [3.1] TSTR - RandomForestClassifier
▶️  [3.2] TSTR - MLP (Keras)


I0000 00:00:1744891931.440319     896 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1768 MB memory:  -> device: 0, name: NVIDIA RTX A1000 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
I0000 00:00:1744891933.928098   43758 service.cc:152] XLA service 0x7f7bb401d610 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744891933.928134   43758 service.cc:160]   StreamExecutor device (0): NVIDIA RTX A1000 Laptop GPU, Compute Capability 8.6
2025-04-17 14:12:13.971244: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744891934.085878   43758 cuda_dnn.cc:529] Loaded cuDNN version 90701











I0000 00:00:1744891938.899520   43758 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

























KeyboardInterrupt: 

In [None]:
from glob import glob
from tqdm import tqdm

# Batch evaluation: run process_one_file on every synthetic CSV of a folder.
real_data_path = "./dataset/kdd_full_clean_5classes.csv"
input_folder = "./generations"
output_folder = "./results"

# glob() returns paths in arbitrary order; sorting makes the processing order
# reproducible, so an interrupted run can be resumed from a known position.
files = sorted(glob(f"{input_folder}/*.csv"))
for file_path in tqdm(files):
    process_one_file(file_path, real_data_path, output_folder)


  0%|          | 0/20 [00:00<?, ?it/s]


🟢 Traitement de : cluster_vae_df_eps_0pt5
▶️  [1] KS / Wasserstein / JSD
▶️  [2] Distribution des classes (CB Diff)
▶️  [3.1] TSTR - RandomForestClassifier


  0%|          | 0/20 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [13]:
import pandas as pd
from glob import glob
import os

def load_global_summary(file_path):
    """
    Load a single result CSV file and extract the 'GLOBAL' summary row.

    The model name is the file's base name with the leading 'results_' prefix
    and the trailing '.csv' extension removed. Only those two positions are
    stripped, so the same substrings occurring inside the name are preserved.

    Parameters:
    - file_path (str): Path to the results_*.csv file.

    Returns:
    - pd.DataFrame or None: The summary metrics with a leading 'Model' column,
      or None if the file has no 'Feature' column or no 'GLOBAL' row.
    """
    df = pd.read_csv(file_path)

    # A file without a 'Feature' column cannot hold a 'GLOBAL' row; report it
    # the same way instead of raising KeyError.
    if 'Feature' not in df.columns:
        print(f"❌ No 'GLOBAL' row found in {file_path}")
        return None

    global_row = df[df['Feature'] == 'GLOBAL'].copy()

    if global_row.empty:
        print(f"❌ No 'GLOBAL' row found in {file_path}")
        return None

    # Extract model name from filename: strip the prefix and the extension only
    # at the ends (str.replace would also remove occurrences inside the name).
    prefix, suffix = "results_", ".csv"
    model_name = os.path.basename(file_path)
    if model_name.startswith(prefix):
        model_name = model_name[len(prefix):]
    if model_name.endswith(suffix):
        model_name = model_name[:-len(suffix)]

    global_row = global_row.reset_index(drop=True)
    global_row.insert(0, "Model", model_name)  # Add model name as first column
    return global_row


def load_all_global_summaries(results_folder):
    """
    Collect the 'GLOBAL' summary row of every results_*.csv file in a folder.

    Parameters:
    - results_folder (str): Directory that is searched for results_*.csv files.

    Returns:
    - pd.DataFrame or None: One row per model, with 'Model' as the first column
      followed by the remaining columns in alphabetical order, rows sorted by
      model name; None when no file yields a summary row.
    """
    pattern = f"{results_folder}/results_*.csv"

    # Files are visited in sorted order; those without a summary row are dropped.
    candidates = (load_global_summary(path) for path in sorted(glob(pattern)))
    rows = [row for row in candidates if row is not None]

    if len(rows) == 0:
        print("⚠️ No global summaries found.")
        return None

    # Stack the one-row frames, then put 'Model' first and sort the other columns.
    merged = pd.concat(rows, ignore_index=True)
    ordered_cols = ['Model'] + sorted(col for col in merged.columns if col != 'Model')

    # Order the rows by model name and renumber the index.
    summary_df = merged[ordered_cols].sort_values(by='Model').reset_index(drop=True)

    print(f"✅ Summaries loaded: {len(summary_df)} models")
    return summary_df


In [15]:
# Gather the GLOBAL rows of every CICIDS result file into one table.
summary_df = load_all_global_summaries("./results_cicids")

# Nothing is printed or written when no summary could be loaded.
if summary_df is not None:
    print(summary_df.head(5))

    # Persist the merged table next to the notebook.
    summary_path = "./global_results_cicids.csv"
    summary_df.to_csv(path_or_buf=summary_path, index=False)
    print(f"💾 Fichier de synthèse sauvegardé dans : {summary_path}")


✅ Summaries loaded: 57 models
                              Model  # Différentes  # Features  \
0   cluster_vae_df_e20_k40_eps_0pt5           66.0        78.0   
1   cluster_vae_df_e20_k40_eps_1pt0           66.0        78.0   
2  cluster_vae_df_e20_k40_eps_1pt25           66.0        78.0   
3   cluster_vae_df_e20_k40_eps_1pt5           66.0        78.0   
4  cluster_vae_df_e20_k40_eps_1pt75           66.0        78.0   

   # Similaires (p > 0.05)  CB Diff (%)  Diversity Gap (%) Feature  \
0                     12.0     0.000191         -51.410853  GLOBAL   
1                     12.0     0.000191           8.365395  GLOBAL   
2                     12.0     0.000191          39.303534  GLOBAL   
3                     12.0     0.000191          76.784478  GLOBAL   
4                     12.0     0.000191         120.399052  GLOBAL   

   Jensen-Shannon Divergence  KS P-value  KS Statistic  ...     PD (%)  \
0                   0.016333    0.153846      0.146846  ...  84.615385   
1   