In [None]:
`# --- PESV (α + δ + γ') Classification Script ---
# This script replaces the original .ipynb notebook.
# It loads the final "Robust Model" dataset and runs all three
# classification tasks (binary, category, application) using
# scikit-learn's best practices (Pipeline, Scaler, and Class Weighting).

print("--- Initializing PESV v2 Classification Script ---")

# --- Step 0: Ensure correct libraries are installed ---
# Please run this in a separate Colab cell before running this script:
# !pip install -U scikit-learn imbalanced-learn

import pandas as pd
import numpy as np
import time
import os
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# Note: You can swap RandomForestClassifier for XGBClassifier if you prefer
# from xgboost import XGBClassifier

print("All libraries imported successfully.")

# --- PART 1: Configuration ---

# --- File & Path Configuration ---
# Location of the assembled PESV feature table on the mounted Google Drive.
BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"
FINAL_PESV_FILE = os.path.join(BASE_PATH, "final_PESV_dataset_v2.csv")

# --- ML Configuration ---
# Defined before the model so that a single seed drives both the
# train/test split and the classifier.
TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# --- Model Configuration ---
# We use RandomForestClassifier because it is robust and supports
# class weighting through a single constructor argument.
# 'class_weight="balanced"' is the MOST important parameter:
# it automatically gives more weight to minority classes (like YouTube/BitTorrent).
# n_jobs=-1 uses all available CPU cores in Colab.
CLASSIFIER_MODEL = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    class_weight="balanced",
    n_jobs=-1,
)

# You can uncomment the line below to use XGBoost instead.
# Note: XGBoost does not have a simple 'class_weight' param.
# You would need to calculate 'sample_weight' manually using
# from sklearn.utils.class_weight import compute_sample_weight
# and pass it to model.fit()
# CLASSIFIER_MODEL = XGBClassifier(random_state=RANDOM_STATE, n_jobs=-1)

# --- PART 2: Main Classification Function ---

def run_classification_task(df, target_label):
    """
    Train and evaluate one classifier for a given target column.

    The function builds the feature matrix from every column of ``df`` that
    is not a known label/metadata column, performs a stratified train/test
    split (using the module-level ``TEST_SET_SIZE`` and ``RANDOM_STATE``),
    fits a ``StandardScaler`` + classifier pipeline on the training part and
    prints accuracy, a classification report and a confusion matrix for the
    test part.

    Args:
        df: DataFrame holding the feature columns plus the label columns
            ('filename', 'application', 'category', 'binary_type').
        target_label: Name of the column in ``df`` to predict.

    Returns:
        The accuracy on the held-out test split, as returned by
        ``accuracy_score`` (a fraction between 0 and 1).

    Raises:
        KeyError: If ``target_label`` is not a column of ``df``.
    """
    # Local import keeps this block self-contained; scikit-learn is already
    # a dependency of this script.
    from sklearn.base import clone

    print(f"\n{'='*70}")
    print(f"--- Starting Task: Classify '{target_label}' ---")
    print(f"{'='*70}")

    # --- 1. Prepare Data (X, y) ---
    # The target itself is always excluded from the features, even when it is
    # not one of the hard-coded label columns; otherwise it would leak into X.
    non_feature_cols = {'filename', 'application', 'category', 'binary_type', target_label}
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    X = df[feature_cols]
    y = df[target_label]

    print(f"Features: {len(feature_cols)}, Samples: {len(X)}")

    # Get class names for the report
    class_labels = sorted(y.unique())
    print(f"Target classes ({len(class_labels)}): {class_labels}\n")

    # Handle NaN/Inf values (just in case): infinities are first mapped to NaN
    # so that a single fill covers both.
    X = X.replace([np.inf, -np.inf], np.nan)
    if X.isnull().values.any():
        print("Warning: Found NaN values in features. Filling with 0.")
        X = X.fillna(0)

    # --- 2. Train/Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=y  # IMPORTANT: Ensures class distribution is the same in train/test
    )
    print(f"Data split: {len(X_train)} train samples, {len(X_test)} test samples.")

    # --- 3. Create Scikit-learn Pipeline ---
    # Step 1: StandardScaler() -> Scales all features (e.g., mean=0, std=1)
    # Step 2: an unfitted copy of CLASSIFIER_MODEL
    # The pipeline fits its final estimator in place, so we clone the shared
    # module-level model; each task then gets its own independent instance.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', clone(CLASSIFIER_MODEL))
    ])

    # --- 4. Train the Model ---
    print("\nTraining model... (This may take a moment)")
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()

    # We train the *entire pipeline*
    pipeline.fit(X_train, y_train)

    elapsed = time.perf_counter() - start_time
    print(f"Training complete in {elapsed:.2f} seconds.")

    # --- 5. Make Predictions & Evaluate ---
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("\n--- Evaluation Results ---")
    print(f"Accuracy: {accuracy * 100:.2f}%\n")

    print("Classification Report:")
    # 'labels' pins the report rows to the full class list so that
    # 'target_names' always lines up, even if a rare class is absent from the
    # test split and the predictions. 'target_names' must be strings.
    # 'zero_division=0' prevents warnings for classes with 0 samples.
    print(classification_report(
        y_test, y_pred,
        labels=class_labels,
        target_names=[str(label) for label in class_labels],
        zero_division=0,
    ))

    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    print(cm_df)

    print(f"\n--- Task for '{target_label}' Finished ---")
    return accuracy

# --- PART 3: Main Execution ---
def main():
    """
    Load the PESV dataset and run every classification task on it.

    Prints a fatal-error message and returns early when the dataset file is
    missing or cannot be parsed; otherwise runs the 'binary_type',
    'category' and 'application' tasks in that order and prints an accuracy
    summary.
    """
    # Keep the printed report free of UserWarning noise.
    warnings.filterwarnings("ignore", category=UserWarning)

    dataset_present = os.path.exists(FINAL_PESV_FILE)
    if not dataset_present:
        print(f"FATAL ERROR: Could not find dataset at '{FINAL_PESV_FILE}'")
        print("Please check the path and ensure the assembly script ran successfully.")
        return

    print(f"Loading final dataset from: {FINAL_PESV_FILE}...")
    try:
        df_full = pd.read_csv(FINAL_PESV_FILE)
    except Exception as e:
        # Top-level boundary: report the problem and stop instead of crashing.
        print(f"FATAL ERROR: Could not read CSV file. Error: {e}")
        return
    else:
        print(f"Successfully loaded dataset with shape: {df_full.shape}")

    # One accuracy per target column, collected in execution order.
    results = {
        task: run_classification_task(df_full, task)
        for task in ('binary_type', 'category', 'application')
    }

    # --- Final Summary Report ---
    print(f"\n{'='*70}")
    print("--- All Classification Tasks Complete ---")
    print(f"{'='*70}\n")
    print("Final Accuracy Summary:")
    for task, acc in results.items():
        print(f"  - {task:<15}: {acc * 100:.2f}% Accuracy")

if __name__ == "__main__":
    # The dataset lives on Google Drive, so only run once the drive is mounted;
    # otherwise tell the user how to mount it.
    drive_is_mounted = os.path.exists("/content/drive/MyDrive")
    if drive_is_mounted:
        main()
    else:
        print("Please mount your Google Drive first!")
        print("Run this in a cell: from google.colab import drive; drive.mount('/content/drive')")

# Printed unconditionally, whether or not the tasks actually ran.
print("\n--- PESV v2 Classification Script Finished ---")


--- Initializing PESV v2 Classification Script ---
All libraries imported successfully.
Loading final dataset from: /content/drive/MyDrive/1 Skripsi/final_PESV_dataset_v2.csv...
Successfully loaded dataset with shape: (10105, 112)

--- Starting Task: Classify 'binary_type' ---
Features: 108, Samples: 10105
Target classes (2): ['NonVPN', 'VPN']

Data split: 8084 train samples, 2021 test samples.

Training model... (This may take a moment)
Training complete in 2.22 seconds.

--- Evaluation Results ---
Accuracy: 96.54%

Classification Report:
              precision    recall  f1-score   support

      NonVPN       0.98      0.98      0.98      1630
         VPN       0.92      0.90      0.91       391

    accuracy                           0.97      2021
   macro avg       0.95      0.94      0.94      2021
weighted avg       0.97      0.97      0.97      2021

Confusion Matrix:
        NonVPN  VPN
NonVPN    1598   32
VPN         38  353

--- Task for 'binary_type' Finished ---

--- Sta