In [6]:
# ============================================
# PIPELINE TRAINING FINAL + VISUAL + FEATURE IMPORTANCE
# (VERSI DIKEMBANGKAN)
#
# Tahap 1 : Baseline model tree (kode awalmu, dipertahankan)
# Tahap 2 : Pengembangan Random Forest
#           - Pipeline: StandardScaler + SMOTE + RF
#           - RandomizedSearchCV (tuning hyperparameter)
#           - Evaluasi F1-macro + Confusion Matrix
#           - Feature importance model terbaik
# ============================================

import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier
)

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # pipeline yang support SMOTE

from scipy.stats import randint

# ============================================================
# 1. LOAD THE LABELED DATASET (RAW, NOT YET NORMALIZED)
# ============================================================
df = pd.read_excel("dataset_labeled_wsn.xlsx")

# Quick look at the data: first rows and the available column names.
print("=== 5 Baris Pertama Dataset Berlabel ===")
print(df.head(5))
print("\nKolom yang tersedia:", list(df.columns))

# The 'label' column is the target; every remaining column is a feature.
y = df["label"]
X = df.drop("label", axis=1)

# Class distribution over the full dataset.
print("\nDistribusi label (seluruh dataset):")
print(Counter(y))


# ============================================================
# 2. TRAIN-TEST SPLIT (STRATIFIED)
#    - Stratifying on y keeps the class proportions of the train and
#      test partitions close to each other.
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the class counts of both partitions (train is still unbalanced here).
for _heading, _partition_labels in (
    ("\nDistribusi label di TRAIN (sebelum SMOTE):", y_train),
    ("\nDistribusi label di TEST:", y_test),
):
    print(_heading)
    print(Counter(_partition_labels))


# ============================================================
# 3. FEATURE NORMALIZATION (NO DATA LEAKAGE)
#    - The StandardScaler statistics are learned from the training set only
#    - The test set is merely transformed with that same fitted scaler
# ============================================================
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first five scaled training rows as a sanity check.
print("\nContoh X_train_scaled (5 baris):")
print(X_train_scaled[:5])


# ============================================================
# 4. SMOTE ON THE TRAINING DATA ONLY (BASELINE)
#    - Produces the "balanced" training set used by the baseline models
#    - For the Random Forest tuning stage further below, SMOTE is wrapped
#      inside its own pipeline instead of using this resampled copy
# ============================================================
print("\n=== SMOTE: balancing data training (untuk baseline) ===")
print("Sebelum SMOTE:", Counter(y_train))

# Oversample the scaled training features; the test set is never resampled.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

print("Sesudah SMOTE:", Counter(y_train_bal))


# ============================================================
# 5. TREE-BASED MODEL DEFINITIONS (BASELINE)
# ============================================================
# Every model shares the same seed; the ensembles share the same tree count.
_SEED = 42
_N_TREES = 200

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=_SEED),
    "Random Forest": RandomForestClassifier(
        n_estimators=_N_TREES, random_state=_SEED, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=_N_TREES, learning_rate=0.1, max_depth=3, random_state=_SEED
    ),
    "Extra Trees": ExtraTreesClassifier(
        n_estimators=_N_TREES, random_state=_SEED, n_jobs=-1
    ),
}

# Filled by the evaluation loop: model name -> fitted model and its metrics.
results = {}

# Fixed label ordering shared by the confusion matrices and reports.
label_order = sorted(y.unique())

def plot_confusion(cm, labels, title):
    """Show a confusion matrix as an annotated blue heatmap.

    Args:
        cm: Confusion matrix of integer counts (cells are rendered with the
            ``"d"`` format).
        labels: Class names used as tick labels on both axes.
        title: Title placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    # Columns are labeled as predictions, rows as ground truth.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def plot_f1_bar(report_dict, title):
    """Draw a bar chart of the per-class F1-score from a classification report.

    Args:
        report_dict: Mapping shaped like the result of
            ``classification_report(..., output_dict=True)``. Per-class
            entries are dicts holding an ``"f1-score"`` value.
        title: Title placed above the chart.
    """
    # Aggregate rows that classification_report may emit; none of them is a
    # class. "micro avg" replaces "accuracy" when the report is restricted to
    # a subset of labels, and "samples avg" appears for multilabel targets.
    summary_keys = {
        "accuracy",
        "micro avg",
        "macro avg",
        "weighted avg",
        "samples avg",
    }

    class_names = []
    f1_scores = []
    for key, entry in report_dict.items():
        # Scalar entries (such as the accuracy float) carry no "f1-score".
        if key in summary_keys or not isinstance(entry, dict):
            continue
        class_names.append(key)
        f1_scores.append(entry["f1-score"])

    plt.figure(figsize=(8, 5))
    bars = plt.bar(class_names, f1_scores, color="mediumseagreen")
    plt.ylim(0, 1.05)  # headroom above 1.0 for the value labels
    plt.title(title)
    plt.ylabel("F1-score")
    plt.xlabel("Kelas")
    plt.xticks(rotation=30, ha="right")

    # Print each score just above its bar.
    for bar, score in zip(bars, f1_scores):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.02,
            f"{score:.2f}",
            ha="center",
            va="bottom",
            fontsize=9,
        )

    plt.tight_layout()
    plt.show()


# ============================================================
# 6. TRAIN & EVALUATE EACH MODEL (BASELINE)
#    - Fit on the SMOTE-balanced training data (X_train_bal)
#    - Evaluate on the original (scaled, not resampled) test data
# ============================================================
# classification_report formats target_names as text, so hand it strings.
report_class_names = [str(label) for label in label_order]

for name, model in models.items():
    print(f"\n================= {name} (BASELINE) =================")

    # Fit on the balanced training set, predict on the untouched test set.
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test_scaled)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"Akurasi: {acc:.4f}")

    # Classification report, as a dict for metrics/plotting and as text.
    # `labels` is pinned so the rows always line up with `target_names` and
    # with the confusion matrix below, even if some class never shows up in
    # y_test / y_pred.
    report_dict = classification_report(
        y_test, y_pred,
        labels=label_order,
        target_names=report_class_names,
        output_dict=True,
    )
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=label_order,
        target_names=report_class_names,
    ))

    # Confusion matrix (rows = actual, columns = predicted, in label_order).
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    print("Confusion Matrix (raw):")
    print(cm)

    # Keep the fitted model and its headline metrics for later comparison.
    results[name] = {
        "model": model,
        "accuracy": acc,
        "cm": cm,
        "report": report_dict,
        "f1_macro": report_dict["macro avg"]["f1-score"],
        "precision_macro": report_dict["macro avg"]["precision"],
        "recall_macro": report_dict["macro avg"]["recall"],
    }

    # Visual 1: colored confusion matrix
    plot_confusion(cm, labels=label_order, title=f"Confusion Matrix - {name}")

    # Visual 2: per-class F1-score bar chart
    plot_f1_bar(report_dict, title=f"F1-score per Kelas - {name}")

# ============================================================
# 7. MODEL COMPARISON SUMMARY (BASELINE)
# ============================================================
# One row per baseline model with its headline test-set metrics.
summary_rows = [
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "F1_macro": metrics["f1_macro"],
        "Precision_macro": metrics["precision_macro"],
        "Recall_macro": metrics["recall_macro"],
    }
    for model_name, metrics in results.items()
]

summary_df = pd.DataFrame(summary_rows)
print("\n=== RINGKASAN METRIK MODEL (BASELINE) ===")
print(summary_df)

# Two comparison charts with the same layout: accuracy first, then macro F1.
for metric_col, bar_color, chart_title, y_label in (
    (
        "Accuracy",
        "cornflowerblue",
        "Perbandingan Akurasi Antar Model (Baseline)",
        "Accuracy",
    ),
    (
        "F1_macro",
        "mediumseagreen",
        "Perbandingan F1-Score Macro Antar Model (Baseline)",
        "F1-score (Macro)",
    ),
):
    plt.figure(figsize=(8, 5))
    plt.bar(summary_df["Model"], summary_df[metric_col], color=bar_color)
    plt.ylim(0, 1.05)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xticks(rotation=20, ha="right")
    # Annotate every bar with its value, slightly above the bar top.
    for position, value in enumerate(summary_df[metric_col]):
        plt.text(position, value + 0.01, f"{value:.3f}", ha="center", fontsize=9)
    plt.tight_layout()
    plt.show()


# ============================================================
# 8. PICK THE BEST MODEL (BY BASELINE ACCURACY)
# ============================================================
# The ranking key is the accuracy stored in `results`, i.e. the accuracy
# measured on the test split by the baseline evaluation loop.
best_model_name, _best_entry = max(
    results.items(), key=lambda item: item[1]["accuracy"]
)
best_model = _best_entry["model"]

print(f"\n=== MODEL TERBAIK SEMENTARA (BASELINE): {best_model_name} ===")
print(f"Akurasi: {results[best_model_name]['accuracy']:.4f}")


# ============================================================
# 9. RANDOM FOREST FEATURE IMPORTANCE (BASELINE)
# ============================================================
if "Random Forest" not in results:
    # Nothing to inspect when the baseline Random Forest was not trained.
    print("\nRandom Forest tidak ditemukan di results, tidak bisa hitung feature importance.")
else:
    rf_model_baseline = results["Random Forest"]["model"]
    feature_names = X.columns
    importances = rf_model_baseline.feature_importances_

    # Pair each feature name with its importance, most important first.
    fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
    fi_df = fi_df.sort_values(by="importance", ascending=False)

    print("\n=== FEATURE IMPORTANCE - RANDOM FOREST (BASELINE) ===")
    print(fi_df)

    # Horizontal bar chart; the y axis is flipped so the top-ranked feature
    # is drawn at the top.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(fi_df["feature"], fi_df["importance"], color="salmon")
    ax.set_xlabel("Importance")
    ax.set_title("Feature Importance - Random Forest (Baseline)")
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()


# =================================================================
# 10. MODEL DEVELOPMENT: PIPELINE + RANDOM FOREST HYPERPARAMETER TUNING
# =================================================================
print("\n\n================= PENGEMBANGAN MODEL RANDOM FOREST =================")

# In this stage:
# - X_train_scaled / X_train_bal are NOT used directly
# - A pipeline StandardScaler -> SMOTE -> RandomForest is searched instead,
#   so scaling and oversampling are fitted inside every cross-validation fold
# - Random Forest hyperparameters are tuned with RandomizedSearchCV
# - The main score is macro F1 (fairer for imbalanced multiclass data)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipeline_steps = [
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42, k_neighbors=5)),
    ("rf", RandomForestClassifier(random_state=42, n_jobs=-1)),
]
rf_pipeline = ImbPipeline(steps=pipeline_steps)

# Search space; the "rf__" prefix routes each parameter to the "rf" step.
param_dist = {
    "rf__n_estimators": randint(150, 400),
    "rf__max_depth": [None, 5, 10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["sqrt", "log2"],
    # Optional, can be enabled for experiments:
    # "rf__class_weight": [None, "balanced"]
}

rand_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=40,            # number of random combinations to try
    scoring="f1_macro",   # optimise macro F1 rather than plain accuracy
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Fit on the ORIGINAL training split (unscaled, not resampled): the scaler
# and SMOTE are already steps of the pipeline.
rand_search.fit(X_train, y_train)

print("\nBest params (RandomizedSearchCV):")
print(rand_search.best_params_)
print("Best CV F1-macro:", rand_search.best_score_)

best_rf_pipeline = rand_search.best_estimator_

# ============================================================
# 11. EVALUATE THE BEST RF PIPELINE ON THE TEST SET
# ============================================================
# The pipeline receives the raw test features; its own scaler step handles
# the normalization.
y_pred_best = best_rf_pipeline.predict(X_test)

# classification_report formats target_names as text, so hand it strings.
# `labels` is pinned to label_order so the report rows always line up with
# `target_names` and with the confusion matrix, even if some class never
# shows up in y_test / y_pred_best.
tuned_report_names = [str(label) for label in label_order]

print("\n=== CLASSIFICATION REPORT - RF TERBAIK (TEST SET) ===")
print(classification_report(
    y_test, y_pred_best,
    labels=label_order,
    target_names=tuned_report_names,
))

cm_best = confusion_matrix(y_test, y_pred_best, labels=label_order)
print("\nConfusion Matrix RF Terbaik (raw):")
print(cm_best)

plot_confusion(cm_best, labels=label_order, title="Confusion Matrix - Random Forest (Tuned)")

# Dict form of the same report, used for the per-class F1 chart.
report_best_dict = classification_report(
    y_test, y_pred_best,
    labels=label_order,
    target_names=tuned_report_names,
    output_dict=True,
)
plot_f1_bar(report_best_dict, title="F1-score per Kelas - Random Forest (Tuned)")


# ============================================================
# 12. CROSS-VALIDATION SCORE OF THE BEST RF PIPELINE
#     - Reported to show how stable the performance is across folds
# ============================================================
# NOTE(review): this reuses the same `cv` splitter, training data and scoring
# as the randomized search, so it presumably reproduces that search's
# best-candidate fold scores - confirm whether a separate splitter is wanted.
f1_cv_scores = cross_val_score(
    estimator=best_rf_pipeline,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)

print("\n=== CROSS-VALIDATION F1-MACRO (TRAIN SET) - RF TERBAIK ===")
print("Mean F1-macro:", np.mean(f1_cv_scores))
print("Std  F1-macro:", np.std(f1_cv_scores))


# ============================================================
# 13. FEATURE IMPORTANCE OF THE BEST RF
#     - Reads the RandomForest step out of the fitted pipeline
# ============================================================
feature_names = X.columns
rf_final = best_rf_pipeline.named_steps["rf"]
final_importances = rf_final.feature_importances_

# Pair each feature name with its importance, most important first.
fi_final_df = pd.DataFrame(
    {"feature": feature_names, "importance": final_importances}
)
fi_final_df = fi_final_df.sort_values(by="importance", ascending=False)

print("\n=== FEATURE IMPORTANCE - RANDOM FOREST (TUNED) ===")
print(fi_final_df)

# Horizontal bar chart; the y axis is flipped so the top-ranked feature is
# drawn at the top.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(fi_final_df["feature"], fi_final_df["importance"], color="orange")
ax.set_xlabel("Importance")
ax.set_title("Feature Importance - Random Forest (Tuned)")
ax.invert_yaxis()
fig.tight_layout()
plt.show()


# ============================================================
# 14. (OPTIONAL) SAVE THE BEST PIPELINE FOR USE IN THE WEBSITE
# ============================================================
# The whole pipeline object (scaler + SMOTE + RF) is written to disk, so the
# website backend only has to load this file and call .predict() - no manual
# scaling needed there.
import joblib

joblib.dump(
    value=best_rf_pipeline,
    filename="rf_wsn_multiclass_best_pipeline.pkl",
)
print("\nModel pipeline Random Forest terbaik sudah disimpan ke 'rf_wsn_multiclass_best_pipeline.pkl'")


=== 5 Baris Pertama Dataset Berlabel ===
   energy_level  energy_drop_rate  latency_ms  end_to_end_delay  packet_loss  \
0     21.236204          9.753572  159.799091        219.731697     1.560186   
1     23.777467          6.668543   71.430023        230.177695     0.564116   
2     16.370173          5.909125   77.510676        160.848449     5.247564   
3     25.743240          6.999305   56.999849        294.751104     2.327713   
4     15.990213          7.571172  138.862185        109.290083     6.075449   

   throughput_kbps         label  
0        88.998630        normal  
1       230.499693        normal  
2       157.986255        normal  
3        72.651609        normal  
4        92.631031  energy_fault  

Kolom yang tersedia: ['energy_level', 'energy_drop_rate', 'latency_ms', 'end_to_end_delay', 'packet_loss', 'throughput_kbps', 'label']

Distribusi label (seluruh dataset):
Counter({'normal': 7735, 'gateway_fault': 2684, 'routing_fault': 2535, 'energy_fault': 1030, 'l

ImportError: cannot import name 'LoadFlags' from 'matplotlib.ft2font' (c:\Users\rahay\.pyenv\pyenv-win\versions\3.12.5\Lib\site-packages\matplotlib\ft2font.cp312-win_amd64.pyd)