In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v3.csv")

df_read = pd.read_csv(FINAL_PESV_FILE)
df_read.head()

Unnamed: 0,filename,application,category,binary_type,alpha_pp_0,alpha_pp_1,alpha_pp_2,alpha_pp_3,alpha_pp_4,alpha_pp_5,...,burst_dur_p75,burst_idle_count,burst_idle_sum,burst_idle_mean,burst_idle_std,burst_idle_min,burst_idle_max,burst_idle_median,burst_idle_p25,burst_idle_p75
0,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,0.0,2.318271,0.0,0.0,0.0,0.0,...,0.196231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,0.0,3.087183,0.868822,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,0.0,2.294946,0.0,0.0,0.0,0.0,...,0.131098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,0.0,2.546194,0.713163,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,vpn_skype_files1b.pcap.UDP_10-8-8-130_49539_21...,Skype,File Transfer,VPN,0.0,0.433829,0.0,0.0,0.0,0.0,...,0.130659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# --- PESV v3 "Championship" Ablation Study (FIXED) ---
#
# This script loads the final 'final_PESV_dataset_v3.csv'
# and runs a "championship" ablation study.
#
# v2 FIX: The feature-finding logic has been corrected to
# match the actual column names from the assembly script.
# - alpha'' features start with 'alpha_pp_'
# - delta features start with 'c2s_', 's2c_', 'flow_', or 'total_'
# - gamma' features start with 'burst_'

print("--- Initializing PESV v3 Championship Ablation Study (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"       - Runs on the full v3 dataset
# "VPN_ONLY"   - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "FULL"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v3.csv")

# Use RandomForest as the standard classifier for this study
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced", # Use this for simplicity and speed
    n_jobs=-1
)

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Load Data and Define Feature Sets ---

def load_data(experiment_mode):
    """Loads and filters the dataset based on the experiment mode."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    # FIX: Use actual column name patterns, not assumed prefixes

    # 1. Find Alpha'' (α'') columns
    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    # 2. Find Delta (δ) columns
    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or
                          c.startswith('s2c_') or
                          c.startswith('flow_') or
                          c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    # 3. Find Gamma' (γ') columns
    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # This is our master list of feature sets to test
    feature_sets = {
        "Alpha'' (α'') only": alpha_pp_cols,
        "Delta (δ) only": delta_cols,
        "Gamma' (γ') only": gamma_p_cols,

        "Alpha'' (α'') + Delta (δ)": alpha_pp_cols + delta_cols,
        "Alpha'' (α'') + Gamma' (γ')": alpha_pp_cols + gamma_p_cols,
        "Delta (δ) + Gamma' (γ')": delta_cols + gamma_p_cols, # The v2 Champion

        "Full (α'' + δ + γ')": alpha_pp_cols + delta_cols + gamma_p_cols, # The v3 Hybrid
    }

    # --- Sanity Check ---
    print(f"Found {len(alpha_pp_cols)} Alpha'' (α'') features.")
    print(f"Found {len(delta_cols)} Delta (δ) features.")
    print(f"Found {len(gamma_p_cols)} Gamma' (γ') features.")

    if not alpha_pp_cols or not delta_cols or not gamma_p_cols:
        print("FATAL ERROR: Could not find all feature columns.")
        print("Please check your v3 assembly script and column names.")
        # We return None to trigger the TypeError correctly
        return None, None

    return df, feature_sets

# --- PART 3: Classification Task Function ---

def run_classification_task(df, target_label, feature_set_name, feature_cols):
    """
    Runs a single classification pipeline for a given task and feature set.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Features: '{feature_set_name}' ({len(feature_cols)} features)")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())

    # Handle NaN/Inf values (just in case)
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # Stratify is always critical
    )

    # --- 3. Create Scikit-learn Pipeline ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CLASSIFIER_MODEL)
    ])

    # --- 4. Train the Model ---
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optional: Uncomment for full reports during execution
    # print("\nClassification Report:")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    return accuracy

# --- PART 4: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, feature_sets = load_data(EXPERIMENT_MODE)
    if df is None:
        # This will cause the TypeError you saw, which is correct
        # because the load_data function failed and returned None.
        return

    # --- Define all tasks to run ---
    # We skip 'binary_type' if we are in a filtered mode
    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    # This will store our final summary table
    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING ALL EXPERIMENTS FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for set_name, cols in feature_sets.items():
            if not cols:
                print(f"Skipping '{set_name}': No features found.")
                continue

            acc = run_classification_task(df, task, set_name, cols)
            task_results[set_name] = acc

        summary_results[task] = task_results

    # --- Final Summary Report ---
    # --- THIS IS THE FIX ---
    # The previous line was corrupted by an injected system message.
    print(f"\n{'='*70}")
    # --- END FIX ---
    print(f"--- FINAL ABLATION STUDY SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")
        # Sort results from best to worst
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for set_name, acc in sorted_results:
            print(f"  {set_name:<30}: {acc * 100:.2f}% Accuracy")
        print("") # Newline for readability

    print("--- Ablation Study v3 Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Championship Ablation Study (FIXED) ---
All libraries imported successfully.

--- Loading Data for Mode: FULL ---
Loaded dataset with shape: (9542, 208)
Found 128 Alpha'' (α'') features.
Found 40 Delta (δ) features.
Found 36 Gamma' (γ') features.

--- STARTING ALL EXPERIMENTS FOR TARGET: binary_type ---

--- Running Task ---
  Target: 'binary_type'
  Features: 'Alpha'' (α'') only' (128 features)
Training complete in 4.44s.
Accuracy: 93.45%

--- Running Task ---
  Target: 'binary_type'
  Features: 'Delta (δ) only' (40 features)
Training complete in 1.71s.
Accuracy: 96.23%

--- Running Task ---
  Target: 'binary_type'
  Features: 'Gamma' (γ') only' (36 features)
Training complete in 1.35s.
Accuracy: 94.97%

--- Running Task ---
  Target: 'binary_type'
  Features: 'Alpha'' (α'') + Delta (δ)' (168 features)
Training complete in 2.02s.
Accuracy: 96.44%

--- Running Task ---
  Target: 'binary_type'
  Features: 'Alpha'' (α'') + Gamma' (γ')' (164 features)
Training com

In [None]:
# --- PESV v3 "Championship" Ablation Study (FIXED) ---
#
# This script loads the final 'final_PESV_dataset_v3.csv'
# and runs a "championship" ablation study.
#
# v2 FIX: The feature-finding logic has been corrected to
# match the actual column names from the assembly script.
# - alpha'' features start with 'alpha_pp_'
# - delta features start with 'c2s_', 's2c_', 'flow_', or 'total_'
# - gamma' features start with 'burst_'

print("--- Initializing PESV v3 Championship Ablation Study (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"       - Runs on the full v3 dataset
# "VPN_ONLY"   - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "VPN_ONLY"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v3.csv")

# Use RandomForest as the standard classifier for this study
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced", # Use this for simplicity and speed
    n_jobs=-1
)

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Load Data and Define Feature Sets ---

def load_data(experiment_mode):
    """Loads and filters the dataset based on the experiment mode."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    # FIX: Use actual column name patterns, not assumed prefixes

    # 1. Find Alpha'' (α'') columns
    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    # 2. Find Delta (δ) columns
    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or
                          c.startswith('s2c_') or
                          c.startswith('flow_') or
                          c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    # 3. Find Gamma' (γ') columns
    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # This is our master list of feature sets to test
    feature_sets = {
        "Alpha'' (α'') only": alpha_pp_cols,
        "Delta (δ) only": delta_cols,
        "Gamma' (γ') only": gamma_p_cols,

        "Alpha'' (α'') + Delta (δ)": alpha_pp_cols + delta_cols,
        "Alpha'' (α'') + Gamma' (γ')": alpha_pp_cols + gamma_p_cols,
        "Delta (δ) + Gamma' (γ')": delta_cols + gamma_p_cols, # The v2 Champion

        "Full (α'' + δ + γ')": alpha_pp_cols + delta_cols + gamma_p_cols, # The v3 Hybrid
    }

    # --- Sanity Check ---
    print(f"Found {len(alpha_pp_cols)} Alpha'' (α'') features.")
    print(f"Found {len(delta_cols)} Delta (δ) features.")
    print(f"Found {len(gamma_p_cols)} Gamma' (γ') features.")

    if not alpha_pp_cols or not delta_cols or not gamma_p_cols:
        print("FATAL ERROR: Could not find all feature columns.")
        print("Please check your v3 assembly script and column names.")
        # We return None to trigger the TypeError correctly
        return None, None

    return df, feature_sets

# --- PART 3: Classification Task Function ---

def run_classification_task(df, target_label, feature_set_name, feature_cols):
    """
    Runs a single classification pipeline for a given task and feature set.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Features: '{feature_set_name}' ({len(feature_cols)} features)")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())

    # Handle NaN/Inf values (just in case)
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # Stratify is always critical
    )

    # --- 3. Create Scikit-learn Pipeline ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CLASSIFIER_MODEL)
    ])

    # --- 4. Train the Model ---
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optional: Uncomment for full reports during execution
    # print("\nClassification Report:")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    return accuracy

# --- PART 4: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, feature_sets = load_data(EXPERIMENT_MODE)
    if df is None:
        # This will cause the TypeError you saw, which is correct
        # because the load_data function failed and returned None.
        return

    # --- Define all tasks to run ---
    # We skip 'binary_type' if we are in a filtered mode
    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    # This will store our final summary table
    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING ALL EXPERIMENTS FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for set_name, cols in feature_sets.items():
            if not cols:
                print(f"Skipping '{set_name}': No features found.")
                continue

            acc = run_classification_task(df, task, set_name, cols)
            task_results[set_name] = acc

        summary_results[task] = task_results

    # --- Final Summary Report ---
    # --- THIS IS THE FIX ---
    # The previous line was corrupted by an injected system message.
    print(f"\n{'='*70}")
    # --- END FIX ---
    print(f"--- FINAL ABLATION STUDY SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")
        # Sort results from best to worst
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for set_name, acc in sorted_results:
            print(f"  {set_name:<30}: {acc * 100:.2f}% Accuracy")
        print("") # Newline for readability

    print("--- Ablation Study v3 Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Championship Ablation Study (FIXED) ---
All libraries imported successfully.

--- Loading Data for Mode: VPN_ONLY ---
Loaded dataset with shape: (1897, 208)
Found 128 Alpha'' (α'') features.
Found 40 Delta (δ) features.
Found 36 Gamma' (γ') features.

--- STARTING ALL EXPERIMENTS FOR TARGET: category ---

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') only' (128 features)
Training complete in 0.95s.
Accuracy: 86.32%

--- Running Task ---
  Target: 'category'
  Features: 'Delta (δ) only' (40 features)
Training complete in 0.77s.
Accuracy: 91.05%

--- Running Task ---
  Target: 'category'
  Features: 'Gamma' (γ') only' (36 features)
Training complete in 0.96s.
Accuracy: 91.32%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Delta (δ)' (168 features)
Training complete in 1.28s.
Accuracy: 90.26%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Gamma' (γ')' (164 features)
Training complete in 1.12s

In [None]:
# --- PESV v3 "Championship" Ablation Study (FIXED) ---
#
# This script loads the final 'final_PESV_dataset_v3.csv'
# and runs a "championship" ablation study.
#
# v2 FIX: The feature-finding logic has been corrected to
# match the actual column names from the assembly script.
# - alpha'' features start with 'alpha_pp_'
# - delta features start with 'c2s_', 's2c_', 'flow_', or 'total_'
# - gamma' features start with 'burst_'

print("--- Initializing PESV v3 Championship Ablation Study (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"       - Runs on the full v3 dataset
# "VPN_ONLY"   - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "NONVPN_ONLY"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v3.csv")

# Use RandomForest as the standard classifier for this study
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced", # Use this for simplicity and speed
    n_jobs=-1
)

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Load Data and Define Feature Sets ---

def load_data(experiment_mode):
    """Loads and filters the dataset based on the experiment mode."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    # FIX: Use actual column name patterns, not assumed prefixes

    # 1. Find Alpha'' (α'') columns
    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    # 2. Find Delta (δ) columns
    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or
                          c.startswith('s2c_') or
                          c.startswith('flow_') or
                          c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    # 3. Find Gamma' (γ') columns
    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # This is our master list of feature sets to test
    feature_sets = {
        "Alpha'' (α'') only": alpha_pp_cols,
        "Delta (δ) only": delta_cols,
        "Gamma' (γ') only": gamma_p_cols,

        "Alpha'' (α'') + Delta (δ)": alpha_pp_cols + delta_cols,
        "Alpha'' (α'') + Gamma' (γ')": alpha_pp_cols + gamma_p_cols,
        "Delta (δ) + Gamma' (γ')": delta_cols + gamma_p_cols, # The v2 Champion

        "Full (α'' + δ + γ')": alpha_pp_cols + delta_cols + gamma_p_cols, # The v3 Hybrid
    }

    # --- Sanity Check ---
    print(f"Found {len(alpha_pp_cols)} Alpha'' (α'') features.")
    print(f"Found {len(delta_cols)} Delta (δ) features.")
    print(f"Found {len(gamma_p_cols)} Gamma' (γ') features.")

    if not alpha_pp_cols or not delta_cols or not gamma_p_cols:
        print("FATAL ERROR: Could not find all feature columns.")
        print("Please check your v3 assembly script and column names.")
        # We return None to trigger the TypeError correctly
        return None, None

    return df, feature_sets

# --- PART 3: Classification Task Function ---

def run_classification_task(df, target_label, feature_set_name, feature_cols):
    """
    Runs a single classification pipeline for a given task and feature set.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Features: '{feature_set_name}' ({len(feature_cols)} features)")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())

    # Handle NaN/Inf values (just in case)
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # Stratify is always critical
    )

    # --- 3. Create Scikit-learn Pipeline ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CLASSIFIER_MODEL)
    ])

    # --- 4. Train the Model ---
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optional: Uncomment for full reports during execution
    # print("\nClassification Report:")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    return accuracy

# --- PART 4: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, feature_sets = load_data(EXPERIMENT_MODE)
    if df is None:
        # This will cause the TypeError you saw, which is correct
        # because the load_data function failed and returned None.
        return

    # --- Define all tasks to run ---
    # We skip 'binary_type' if we are in a filtered mode
    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    # This will store our final summary table
    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING ALL EXPERIMENTS FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for set_name, cols in feature_sets.items():
            if not cols:
                print(f"Skipping '{set_name}': No features found.")
                continue

            acc = run_classification_task(df, task, set_name, cols)
            task_results[set_name] = acc

        summary_results[task] = task_results

    # --- Final Summary Report ---
    # --- THIS IS THE FIX ---
    # The previous line was corrupted by an injected system message.
    print(f"\n{'='*70}")
    # --- END FIX ---
    print(f"--- FINAL ABLATION STUDY SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")
        # Sort results from best to worst
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for set_name, acc in sorted_results:
            print(f"  {set_name:<30}: {acc * 100:.2f}% Accuracy")
        print("") # Newline for readability

    print("--- Ablation Study v3 Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Championship Ablation Study (FIXED) ---
All libraries imported successfully.

--- Loading Data for Mode: NONVPN_ONLY ---
Loaded dataset with shape: (7645, 208)
Found 128 Alpha'' (α'') features.
Found 40 Delta (δ) features.
Found 36 Gamma' (γ') features.

--- STARTING ALL EXPERIMENTS FOR TARGET: category ---

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') only' (128 features)
Training complete in 1.71s.
Accuracy: 76.39%

--- Running Task ---
  Target: 'category'
  Features: 'Delta (δ) only' (40 features)
Training complete in 1.64s.
Accuracy: 71.22%

--- Running Task ---
  Target: 'category'
  Features: 'Gamma' (γ') only' (36 features)
Training complete in 1.95s.
Accuracy: 70.44%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Delta (δ)' (168 features)
Training complete in 2.04s.
Accuracy: 78.81%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Gamma' (γ')' (164 features)
Training complete in 1.

In [None]:
# --- PESV v3 Final Classifier Comparison Script (FIXED) ---
#
# This script takes our "champion" feature set from the
# v3 ablation study (the full 'α'' + δ + γ' model)
# and runs it against four different powerful classifiers.
#
# v2 FIX: This version corrects a TypeError with MLPClassifier.
# MLP does not accept 'sample_weight'. We will instead create a
# special pipeline for it using SMOTE to handle imbalance.

print("--- Initializing PESV v3 Final Classifier Comparison (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

# Imbalanced-learn imports (for SMOTE)
try:
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import SMOTE
    IMBLEARN_AVAILABLE = True
    print("Imbalanced-learn library found. SMOTE is available.")
except ImportError:
    IMBLEARN_AVAILABLE = False
    print("WARNING: Imbalanced-learn library not found. `pip install imbalanced-learn`")
    print("MLP Classifier will run without imbalance handling.")

# XGBoost import
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("XGBoost library found.")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("WARNING: XGBoost library not found. `pip install xgboost`")
    print("XGBoost classifier will be skipped.")


print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"       - Runs on the full v3 dataset
# "VPN_ONLY"   - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "FULL"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v3.csv")

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Define Classifiers ---
# We define our four contender models here

classifiers = {
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        class_weight="balanced", # Native support
        n_jobs=-1
    ),

    "SVM (RBF Kernel)": SVC(
        kernel='rbf',
        random_state=RANDOM_STATE,
        class_weight='balanced' # Native support
    )
}

if XGBOOST_AVAILABLE:
    classifiers["XGBoost"] = xgb.XGBClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

if IMBLEARN_AVAILABLE:
    classifiers["MLP (Neural Net)"] = MLPClassifier(
        hidden_layer_sizes=(100, 50), # A simple 2-layer network
        random_state=RANDOM_STATE,
        max_iter=300,
        early_stopping=True,
        n_iter_no_change=10
    )

# --- PART 3: Load Data and Define Feature Set ---

def load_data(experiment_mode):
    """Loads and filters the dataset and selects the ONE best feature set."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or c.startswith('s2c_') or
                          c.startswith('flow_') or c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # --- THIS IS THE KEY ---
    # We are ONLY selecting the "Full (α'' + δ + γ')" feature set
    champion_feature_set = alpha_pp_cols + delta_cols + gamma_p_cols

    print(f"Found {len(champion_feature_set)} features for the champion model.")

    return df, champion_feature_set

# --- PART 4: Classification Task Function ---

def run_classification_task(df, target_label, feature_cols, classifier_name, classifier_model):
    """
    Runs a single classification pipeline for a given task and classifier.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Classifier: '{classifier_name}'")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())

    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y
    )

    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)

    # --- 4. Train the Model (with custom logic per classifier) ---
    print("Training model...")
    start_time = time.time()

    pipeline = None
    y_pred = None

    try:
        if classifier_name in ["RandomForest", "SVM (RBF Kernel)"]:
            # These models have native 'class_weight="balanced"'
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', classifier_model)
            ])
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

        elif classifier_name == "XGBoost":
            # XGBoost needs encoded labels and 'sample_weight' in fit
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', classifier_model)
            ])
            sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)
            pipeline.fit(X_train, y_train_encoded, classifier__sample_weight=sample_weights)

            y_pred_encoded = pipeline.predict(X_test)
            y_pred = le.inverse_transform(y_pred_encoded) # Decode predictions

        elif classifier_name == "MLP (Neural Net)" and IMBLEARN_AVAILABLE:
            # --- FIX: MLP uses SMOTE for imbalance ---
            # We create a special pipeline from imblearn
            pipeline = ImbPipeline([
                ('scaler', StandardScaler()),
                ('smote', SMOTE(random_state=RANDOM_STATE, n_jobs=-1)),
                ('classifier', classifier_model)
            ])
            # We fit on the *original* string labels. SMOTE handles it.
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

        else:
            print(f"Skipping {classifier_name} due to missing library.")
            return None

    except Exception as e:
        print(f"!!!!!!!!!! ERROR during training {classifier_name} !!!!!!!!!!")
        print(e)
        return None

    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optional: Uncomment to see the full detailed report for each model
    # print("\nClassification Report:")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    return accuracy

# --- PART 5: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, champion_features = load_data(EXPERIMENT_MODE)
    if df is None or not champion_features:
        print("Failed to load data or features. Exiting.")
        return

    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING CLASSIFIER COMPARISON FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for clf_name, clf_model in classifiers.items():
            acc = run_classification_task(df, task, champion_features, clf_name, clf_model)
            if acc is not None:
                task_results[clf_name] = acc

        summary_results[task] = task_results

    # --- Final Summary Report ---
    print(f"\n{'='*70}")
    print(f"--- FINAL CLASSIFIER COMPARISON SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print("--- Features: Full (α'' + δ + γ') ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for clf_name, acc in sorted_results:
            print(f"  {clf_name:<20}: {acc * 100:.2f}% Accuracy")
        print("")

    print("--- Classifier Comparison Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Final Classifier Comparison (FIXED) ---
Imbalanced-learn library found. SMOTE is available.
XGBoost library found.
All libraries imported successfully.

--- Loading Data for Mode: FULL ---
Loaded dataset with shape: (9542, 208)
Found 204 features for the champion model.

--- STARTING CLASSIFIER COMPARISON FOR TARGET: binary_type ---

--- Running Task ---
  Target: 'binary_type'
  Classifier: 'RandomForest'
Training model...
Training complete in 4.93s.
Accuracy: 97.01%

--- Running Task ---
  Target: 'binary_type'
  Classifier: 'SVM (RBF Kernel)'
Training model...
Training complete in 2.61s.
Accuracy: 93.82%

--- Running Task ---
  Target: 'binary_type'
  Classifier: 'XGBoost'
Training model...
Training complete in 1.85s.
Accuracy: 97.12%

--- Running Task ---
  Target: 'binary_type'
  Classifier: 'MLP (Neural Net)'
Training model...
!!!!!!!!!! ERROR during training MLP (Neural Net) !!!!!!!!!!
SMOTE.__init__() got an unexpected keyword argument 'n_jobs'

--- STA

In [1]:
# --- PESV v3 "Championship" Ablation Study (FIXED) ---
#
# This script loads the final 'final_PESV_dataset_v3.csv'
# and runs a "championship" ablation study.
#
# v2 FIX: The feature-finding logic has been corrected to
# match the actual column names from the assembly script.
# - alpha'' features start with 'alpha_pp_'
# - delta features start with 'c2s_', 's2c_', 'flow_', or 'total_'
# - gamma' features start with 'burst_'

print("--- Initializing PESV v3 Championship Ablation Study (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"       - Runs on the full v3 dataset
# "VPN_ONLY"   - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "VPN_ONLY"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "VPNOnly-final_PESV_dataset_v3.csv")

# Use RandomForest as the standard classifier for this study
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced", # Use this for simplicity and speed
    n_jobs=-1
)

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Load Data and Define Feature Sets ---

def load_data(experiment_mode):
    """Loads and filters the dataset based on the experiment mode."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    # FIX: Use actual column name patterns, not assumed prefixes

    # 1. Find Alpha'' (α'') columns
    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    # 2. Find Delta (δ) columns
    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or
                          c.startswith('s2c_') or
                          c.startswith('flow_') or
                          c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    # 3. Find Gamma' (γ') columns
    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # This is our master list of feature sets to test
    feature_sets = {
        "Alpha'' (α'') only": alpha_pp_cols,
        "Delta (δ) only": delta_cols,
        "Gamma' (γ') only": gamma_p_cols,

        "Alpha'' (α'') + Delta (δ)": alpha_pp_cols + delta_cols,
        "Alpha'' (α'') + Gamma' (γ')": alpha_pp_cols + gamma_p_cols,
        "Delta (δ) + Gamma' (γ')": delta_cols + gamma_p_cols,

        "Full (α'' + δ + γ')": alpha_pp_cols + delta_cols + gamma_p_cols,
    }

    # --- Sanity Check ---
    print(f"Found {len(alpha_pp_cols)} Alpha'' (α'') features.")
    print(f"Found {len(delta_cols)} Delta (δ) features.")
    print(f"Found {len(gamma_p_cols)} Gamma' (γ') features.")

    if not alpha_pp_cols or not delta_cols or not gamma_p_cols:
        print("FATAL ERROR: Could not find all feature columns.")
        print("Please check your v3 assembly script and column names.")
        # We return None to trigger the TypeError correctly
        return None, None

    return df, feature_sets

# --- PART 3: Classification Task Function ---

def run_classification_task(df, target_label, feature_set_name, feature_cols):
    """
    Runs a single classification pipeline for a given task and feature set.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Features: '{feature_set_name}' ({len(feature_cols)} features)")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())

    # Handle NaN/Inf values (just in case)
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # Stratify is always critical
    )

    # --- 3. Create Scikit-learn Pipeline ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CLASSIFIER_MODEL)
    ])

    # --- 4. Train the Model ---
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Optional: Uncomment for full reports during execution
    # print("\nClassification Report:")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    return accuracy

# --- PART 4: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, feature_sets = load_data(EXPERIMENT_MODE)
    if df is None:
        # This will cause the TypeError you saw, which is correct
        # because the load_data function failed and returned None.
        return

    # --- Define all tasks to run ---
    # We skip 'binary_type' if we are in a filtered mode
    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    # This will store our final summary table
    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING ALL EXPERIMENTS FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for set_name, cols in feature_sets.items():
            if not cols:
                print(f"Skipping '{set_name}': No features found.")
                continue

            acc = run_classification_task(df, task, set_name, cols)
            task_results[set_name] = acc

        summary_results[task] = task_results

    # --- Final Summary Report ---
    # --- THIS IS THE FIX ---
    # The previous line was corrupted by an injected system message.
    print(f"\n{'='*70}")
    # --- END FIX ---
    print(f"--- FINAL ABLATION STUDY SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")
        # Sort results from best to worst
        sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
        for set_name, acc in sorted_results:
            print(f"  {set_name:<30}: {acc * 100:.2f}% Accuracy")
        print("") # Newline for readability

    print("--- Ablation Study v3 Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Championship Ablation Study (FIXED) ---
All libraries imported successfully.

--- Loading Data for Mode: VPN_ONLY ---
Loaded dataset with shape: (2623, 208)
Found 128 Alpha'' (α'') features.
Found 40 Delta (δ) features.
Found 36 Gamma' (γ') features.

--- STARTING ALL EXPERIMENTS FOR TARGET: category ---

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') only' (128 features)
Training complete in 1.81s.
Accuracy: 90.48%

--- Running Task ---
  Target: 'category'
  Features: 'Delta (δ) only' (40 features)
Training complete in 1.21s.
Accuracy: 93.14%

--- Running Task ---
  Target: 'category'
  Features: 'Gamma' (γ') only' (36 features)
Training complete in 1.17s.
Accuracy: 90.29%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Delta (δ)' (168 features)
Training complete in 1.69s.
Accuracy: 93.33%

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') + Gamma' (γ')' (164 features)
Training complete in 1.81s

In [2]:
# --- PESV v3 "Championship" Ablation Study (FIXED) ---
#
# This script loads the final 'final_PESV_dataset_v3.csv'
# and runs a "championship" ablation study.
#
# v2 FIX: The feature-finding logic has been corrected to
# match the actual column names from the assembly script.
# - alpha'' features start with 'alpha_pp_'
# - delta features start with 'c2s_', 's2c_', 'flow_', or 'total_'
# - gamma' features start with 'burst_'
#
# v3 MODIFICATION (by Gemini):
# - Added Precision, Recall, and F1-Score (weighted) to outputs.
# - Added Confusion Matrix to per-run output.
# - Updated final summary to display all metrics in a table.

print("--- Initializing PESV v3 Championship Ablation Study (FIXED) ---")

import pandas as pd
import numpy as np
import time
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- !! SET YOUR EXPERIMENT MODE !! ---
# "FULL"        - Runs on the full v3 dataset
# "VPN_ONLY"    - Runs on only the VPN samples
# "NONVPN_ONLY" - Runs on only the NonVPN samples
EXPERIMENT_MODE = "VPN_ONLY"

BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "VPNOnly-final_PESV_dataset_v3.csv")

# Use RandomForest as the standard classifier for this study
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced", # Use this for simplicity and speed
    n_jobs=-1
)

TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- PART 2: Load Data and Define Feature Sets ---

def load_data(experiment_mode):
    """Loads and filters the dataset based on the experiment mode."""
    print(f"\n--- Loading Data for Mode: {experiment_mode} ---")
    if not os.path.exists(FINAL_PESV_FILE):
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        return None, None

    df_full = pd.read_csv(FINAL_PESV_FILE)

    if experiment_mode == "FULL":
        df = df_full
    elif experiment_mode == "VPN_ONLY":
        df = df_full[df_full['binary_type'] == 'VPN'].copy()
    elif experiment_mode == "NONVPN_ONLY":
        df = df_full[df_full['binary_type'] == 'NonVPN'].copy()
    else:
        print(f"FATAL ERROR: Unknown experiment mode '{experiment_mode}'")
        return None, None

    if df.empty:
        print("FATAL ERROR: The filtered dataset is empty.")
        return None, None

    print(f"Loaded dataset with shape: {df.shape}")

    # --- Define Feature Column Groups (FIXED LOGIC) ---
    all_cols = set(df.columns)
    label_cols = {'filename', 'application', 'category', 'binary_type'}

    # FIX: Use actual column name patterns, not assumed prefixes

    # 1. Find Alpha'' (α'') columns
    alpha_pp_cols = sorted(list([c for c in all_cols if c.startswith('alpha_pp_')]))

    # 2. Find Delta (δ) columns
    delta_cols_set = set([c for c in all_cols if
                          c.startswith('c2s_') or
                          c.startswith('s2c_') or
                          c.startswith('flow_') or
                          c.startswith('total_')])
    delta_cols = sorted(list(delta_cols_set))

    # 3. Find Gamma' (γ') columns
    gamma_p_cols_set = set([c for c in all_cols if c.startswith('burst_')])
    gamma_p_cols = sorted(list(gamma_p_cols_set))

    # This is our master list of feature sets to test
    feature_sets = {
        "Alpha'' (α'') only": alpha_pp_cols,
        "Delta (δ) only": delta_cols,
        "Gamma' (γ') only": gamma_p_cols,

        "Alpha'' (α'') + Delta (δ)": alpha_pp_cols + delta_cols,
        "Alpha'' (α'') + Gamma' (γ')": alpha_pp_cols + gamma_p_cols,
        "Delta (δ) + Gamma' (γ')": delta_cols + gamma_p_cols,

        "Full (α'' + δ + γ')": alpha_pp_cols + delta_cols + gamma_p_cols,
    }

    # --- Sanity Check ---
    print(f"Found {len(alpha_pp_cols)} Alpha'' (α'') features.")
    print(f"Found {len(delta_cols)} Delta (δ) features.")
    print(f"Found {len(gamma_p_cols)} Gamma' (γ') features.")

    if not alpha_pp_cols or not delta_cols or not gamma_p_cols:
        print("FATAL ERROR: Could not find all feature columns.")
        print("Please check your v3 assembly script and column names.")
        # We return None to trigger the TypeError correctly
        return None, None

    return df, feature_sets

# --- PART 3: Classification Task Function ---

def run_classification_task(df, target_label, feature_set_name, feature_cols):
    """
    Runs a single classification pipeline for a given task and feature set.
    """
    print(f"\n--- Running Task ---")
    print(f"  Target: '{target_label}'")
    print(f"  Features: '{feature_set_name}' ({len(feature_cols)} features)")

    # --- 1. Prepare Data (X, y) ---
    X = df[feature_cols]
    y = df[target_label]

    class_labels = sorted(y.unique())
    n_classes = len(class_labels)

    # Handle NaN/Inf values (just in case)
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # Stratify is always critical
    )

    # --- 3. Create Scikit-learn Pipeline ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CLASSIFIER_MODEL)
    ])

    # --- 4. Train the Model ---
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f}s.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)

    # --- MODIFICATION START: Calculate all metrics ---
    accuracy = accuracy_score(y_test, y_pred)

    # Use classification_report for precision, recall, f1
    # Set zero_division=0 to handle cases where a class has no predictions
    # Get output as a dictionary for easier parsing
    report_dict = classification_report(
        y_test,
        y_pred,
        target_names=class_labels,
        zero_division=0,
        output_dict=True
    )

    # We will use the 'weighted avg' for a balanced view
    precision = report_dict['weighted avg']['precision']
    recall = report_dict['weighted avg']['recall']
    f1_score = report_dict['weighted avg']['f1-score']

    # Calculate Confusion Matrix
    # Using 'labels=class_labels' ensures the CM order is consistent
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision (Weighted): {precision * 100:.2f}%")
    print(f"Recall (Weighted): {recall * 100:.2f}%")
    print(f"F1-Score (Weighted): {f1_score * 100:.2f}%")

    print("\nConfusion Matrix:")
    print(f"(Rows: True Labels, Cols: Predicted Labels)")
    print(f"Labels: {class_labels}")
    print(cm)

    # Optional: Uncomment for full reports during execution
    # print("\nClassification Report (Full):")
    # print(classification_report(y_test, y_pred, target_names=class_labels, zero_division=0))

    # Return a dictionary of all metrics for the final summary
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

    return metrics
    # --- MODIFICATION END ---

# --- PART 4: Main Execution ---
def main():
    warnings.filterwarnings("ignore", category=UserWarning)

    df, feature_sets = load_data(EXPERIMENT_MODE)
    if df is None:
        # This will cause the TypeError you saw, which is correct
        # because the load_data function failed and returned None.
        return

    # --- Define all tasks to run ---
    # We skip 'binary_type' if we are in a filtered mode
    if EXPERIMENT_MODE == "FULL":
        tasks_to_run = ['binary_type', 'category', 'application']
    else:
        tasks_to_run = ['category', 'application']

    # This will store our final summary table
    summary_results = {}

    for task in tasks_to_run:
        print(f"\n{'='*70}")
        print(f"--- STARTING ALL EXPERIMENTS FOR TARGET: {task} ---")
        print(f"{'='*70}")

        task_results = {}
        for set_name, cols in feature_sets.items():
            if not cols:
                print(f"Skipping '{set_name}': No features found.")
                continue

            # --- MODIFICATION START: Store the returned metrics dict ---
            metrics = run_classification_task(df, task, set_name, cols)
            task_results[set_name] = metrics
            # --- MODIFICATION END ---

        summary_results[task] = task_results

    # --- Final Summary Report ---
    print(f"\n{'='*70}")
    print(f"--- FINAL ABLATION STUDY SUMMARY ({EXPERIMENT_MODE} Dataset) ---")
    print(f"{'='*70}\n")

    for task, results in summary_results.items():
        print(f"--- Target: {task} ---")

        # --- MODIFICATION START: Update summary table to show all metrics ---

        # Define header
        header = f"  {'Feature Set':<30} | {'Accuracy':<15} | {'Precision (W)':<15} | {'Recall (W)':<15} | {'F1-Score (W)':<15}"
        print(header)
        print(f"  {'-'*30:<30} | {'-'*15:<15} | {'-'*15:<15} | {'-'*15:<15} | {'-'*15:<15}")

        # Sort results from best to worst based on 'accuracy'
        sorted_results = sorted(
            results.items(),
            key=lambda item: item[1]['accuracy'],  # item[1] is the metrics dict
            reverse=True
        )

        for set_name, metrics in sorted_results:
            acc_str = f"{metrics['accuracy'] * 100:.2f}%"
            pre_str = f"{metrics['precision'] * 100:.2f}%"
            rec_str = f"{metrics['recall'] * 100:.2f}%"
            f1_str = f"{metrics['f1_score'] * 100:.2f}%"

            print(f"  {set_name:<30} | {acc_str:<15} | {pre_str:<15} | {rec_str:<15} | {f1_str:<15}")

        # --- MODIFICATION END ---

        print("") # Newline for readability

    print("--- Ablation Study v3 Finished ---")

if __name__ == "__main__":
    if not os.path.exists("/content/drive/MyDrive"):
        print("Please mount your Google Drive first!")
    else:
        main()

--- Initializing PESV v3 Championship Ablation Study (FIXED) ---
All libraries imported successfully.

--- Loading Data for Mode: VPN_ONLY ---
Loaded dataset with shape: (2623, 208)
Found 128 Alpha'' (α'') features.
Found 40 Delta (δ) features.
Found 36 Gamma' (γ') features.

--- STARTING ALL EXPERIMENTS FOR TARGET: category ---

--- Running Task ---
  Target: 'category'
  Features: 'Alpha'' (α'') only' (128 features)
Training complete in 1.35s.
Accuracy: 90.48%
Precision (Weighted): 90.41%
Recall (Weighted): 90.48%
F1-Score (Weighted): 90.36%

Confusion Matrix:
(Rows: True Labels, Cols: Predicted Labels)
Labels: ['Chat', 'Email', 'File Transfer', 'P2P', 'Streaming', 'VoIP']
[[ 20   0   3   0   0   9]
 [  2  23   0   0   0   1]
 [  1   0 150   0   1  11]
 [  0   0   0  71   0   1]
 [  0   0   0   0  50   2]
 [  2   1  14   0   2 161]]

--- Running Task ---
  Target: 'category'
  Features: 'Delta (δ) only' (40 features)
Training complete in 0.92s.
Accuracy: 93.14%
Precision (Weighted): 