In [15]:
# ==========================================================
# 🌏 K-MEANS CLUSTERING
# ==========================================================

import numpy as np
import pandas as pd
import plotly.graph_objects as go

# ---------------------------------
# 1️⃣ DATASET (38 INDONESIAN PROVINCES)
# ---------------------------------
# Each record is (province name, IKU, IKA, IKTL).
data = [
    ("ACEH", 90.94, 61.30, 76.50),
    ("SUMATERA UTARA", 90.90, 60.27, 49.80),
    ("SUMATERA BARAT", 90.53, 57.05, 67.00),
    ("SUMATERA SELATAN", 87.76, 58.16, 44.10),
    ("RIAU", 90.89, 50.84, 51.30),
    ("JAMBI", 90.57, 46.06, 51.00),
    ("BENGKULU", 92.49, 48.98, 55.70),
    ("LAMPUNG", 88.04, 55.36, 37.50),
    ("KEP. BANGKA BELITUNG", 90.15, 57.43, 39.20),
    ("KEP. RIAU", 90.13, 54.93, 66.00),
    ("DKI JAKARTA", 68.46, 40.76, 27.20),
    ("JAWA BARAT", 81.39, 46.87, 42.00),
    ("JAWA TENGAH", 86.35, 51.23, 46.30),
    ("DI YOGYAKARTA", 89.01, 40.28, 48.70),
    ("JAWA TIMUR", 84.73, 55.86, 49.70),
    ("BANTEN", 66.67, 58.93, 43.10),
    ("BALI", 88.99, 55.17, 45.20),
    ("NUSA TENGGARA BARAT", 90.21, 47.78, 66.60),
    ("NUSA TENGGARA TIMUR", 92.03, 54.65, 59.30),
    ("KALIMANTAN BARAT", 92.03, 52.97, 60.40),
    ("KALIMANTAN TENGAH", 91.47, 53.90, 75.50),
    ("KALIMANTAN SELATAN", 92.85, 55.64, 52.10),
    ("KALIMANTAN TIMUR", 89.64, 52.64, 83.60),
    ("KALIMANTAN UTARA", 93.91, 55.93, 100.00),
    ("SULAWESI UTARA", 93.52, 52.12, 62.30),
    ("SULAWESI TENGAH", 91.88, 63.63, 82.60),
    ("SULAWESI SELATAN", 90.58, 57.95, 56.20),
    ("SULAWESI TENGGARA", 92.83, 61.28, 74.90),
    ("GORONTALO", 94.43, 58.70, 80.20),
    ("SULAWESI BARAT", 93.33, 58.82, 73.00),
    ("MALUKU", 92.47, 55.87, 90.70),
    ("MALUKU UTARA", 93.19, 61.05, 86.60),
    ("PAPUA BARAT", 96.22, 59.52, 100.00),
    ("PAPUA BARAT DAYA", 96.28, 55.61, 100.00),
    ("PAPUA", 95.87, 54.73, 100.00),
    ("PAPUA SELATAN", 95.44, 60.73, 93.20),
    ("PAPUA TENGAH", 95.33, 64.67, 100.00),
    ("PAPUA PEGUNUNGAN", 97.68, 63.18, 99.40),
]

# Column order mirrors the tuple layout above: the name first, then the
# three index values.
column_names = ["Provinsi", "IKU", "IKA", "IKTL"]
df = pd.DataFrame(data, columns=column_names)

# Numeric feature matrix: one row per province, one column per index.
X = df[column_names[1:]].to_numpy()

# ---------------------------------
# 2️⃣ K-MEANS PARAMETER INITIALISATION
# ---------------------------------
# Stopping criteria: a hard cap on the number of iterations, and the
# centroid-shift threshold compared against every centroid's movement.
max_iter = 100
tol = 1e-4

# Number of clusters.
k = 3

# Fixed seed so the same k starting rows are drawn on every run.
np.random.seed(42)
initial_idx = np.random.choice(X.shape[0], size=k, replace=False)

# Float copy of the sampled rows; these are the initial centroids.
centroids = np.array(X[initial_idx], dtype=float)

def euclidean_distance_matrix(X, centroids):
    """Return the Euclidean distance from every row of X to every centroid.

    Args:
        X: 2-D array with one point per row.
        centroids: 2-D array with one centroid per row and the same number
            of columns as ``X``.

    Returns:
        2-D array whose entry ``[i, j]`` is the distance between ``X[i]``
        and ``centroids[j]``.
    """
    # Broadcast (points, 1, features) against (1, centroids, features) so
    # every point/centroid pair gets its own difference vector.
    pairwise_diff = X[:, None, :] - centroids[None, :, :]
    return np.linalg.norm(pairwise_diff, axis=-1)

# ---------------------------------
# 3️⃣ ITERATION PROCESS
# ---------------------------------
# Runs K-Means step by step, printing the distance table, the provisional
# cluster membership and the centroid movement at every iteration. Stops as
# soon as every centroid moves by less than `tol`, or after `max_iter`
# iterations. Leaves the final centroids in `centroids` and the latest
# assignment in df["Cluster"].
print("=== K-MEANS (TAHAP DEMI TAHAP) ===\n")
print("Centroid awal:")
for i, c in enumerate(centroids):
    print(f" C{i}: IKU={c[0]:.2f}, IKA={c[1]:.2f}, IKTL={c[2]:.2f}")
print("\n")

for iteration in range(1, max_iter + 1):
    print(f"===== ITERASI {iteration} =====")

    # Euclidean distance from every province to every centroid
    distances = euclidean_distance_matrix(X, centroids)
    dist_df = pd.DataFrame(distances, columns=[f"Jarak_C{j}" for j in range(k)])
    display_table = pd.concat([df[["Provinsi", "IKU", "IKA", "IKTL"]], dist_df], axis=1)
    print("\n📏 Tabel jarak Euclidean:")
    print(display_table.to_string(index=False))

    # Assign each province to its nearest centroid
    assigned = np.argmin(distances, axis=1)
    df["Cluster"] = assigned

    print("\n🧩 Pengelompokan sementara:")
    for cl in range(k):
        anggota = df[df["Cluster"] == cl]["Provinsi"].tolist()
        print(f" Cluster {cl} (n={len(anggota)}): {', '.join(anggota)}")

    # Compute the new centroids. Start from the current ones so that a
    # cluster with no members keeps its previous position: the mean of an
    # empty selection is NaN, which would poison every later distance and
    # make the convergence test below impossible to satisfy.
    new_centroids = centroids.copy()
    for cl in range(k):
        cluster_points = X[assigned == cl]
        if cluster_points.shape[0] == 0:
            print(f"\n⚠️ Cluster {cl} kosong; centroid sebelumnya dipertahankan.")
            continue
        new_centroids[cl] = cluster_points.mean(axis=0)

    # How far each centroid moved in this iteration
    shifts = np.linalg.norm(new_centroids - centroids, axis=1)
    print("\n📈 Centroid baru dan pergeseran:")
    for i, c in enumerate(new_centroids):
        print(f" C{i}: IKU={c[0]:.2f}, IKA={c[1]:.2f}, IKTL={c[2]:.2f}  |  Shift={shifts[i]:.6f}")

    centroids = new_centroids.copy()

    # Convergence: every centroid moved by less than the tolerance
    if np.all(shifts < tol):
        print("\n✅ Konvergen pada iterasi ke-", iteration)
        break
    else:
        print("\n➡️ Lanjut ke iterasi berikutnya...\n")
else:
    # Reached only when the loop ran out of iterations without `break`.
    print(f"\n⚠️ Belum konvergen setelah {max_iter} iterasi.")

# ---------------------------------
# 4️⃣ FINAL CLUSTERING RESULT
# ---------------------------------
# Rank the clusters by the mean of each centroid's coordinates: the lowest
# mean is labelled "Buruk", the middle one "Sedang" and the highest "Baik".
centroid_means = centroids.mean(axis=1)
order = np.argsort(centroid_means)
label_map = {
    order[rank]: category
    for rank, category in enumerate(("Buruk", "Sedang", "Baik"))
}
df["Label"] = df["Cluster"].map(label_map)

print("\n=== HASIL AKHIR CLUSTERING ===")
for idx, (iku, ika, iktl) in enumerate(centroids):
    print(f" C{idx} ({label_map[idx]}): IKU={iku:.2f}, IKA={ika:.2f}, IKTL={iktl:.2f}")

# Cluster statistics
print("\n📊 Jumlah provinsi per kategori:")
print(df["Label"].value_counts())

# Province names grouped by category, numbered from 1 within each group
print("\n📋 Daftar Provinsi per Cluster:")
for label in ("Baik", "Sedang", "Buruk"):
    members = df.loc[df["Label"] == label, "Provinsi"]
    print(f"\n  Cluster {label} ({len(members)} provinsi):")
    for number, prov in enumerate(members, start=1):
        print(f"   {number}. {prov}")

# ---------------------------------
# 5️⃣ INTERACTIVE 3D VISUALISATION
# ---------------------------------
# One colour per category, shared by the province points and the centroid
# markers.
colors = {"Baik": "#10b981", "Sedang": "#f59e0b", "Buruk": "#ef4444"}
hover_template = "<b>%{text}</b><br>IKU: %{x:.2f}<br>IKA: %{y:.2f}<br>IKTL: %{z:.2f}<extra></extra>"

fig = go.Figure()

# One scatter trace per category, added in a fixed order.
for label in ("Baik", "Sedang", "Buruk"):
    subset = df[df["Label"] == label]
    province_trace = go.Scatter3d(
        x=subset["IKU"],
        y=subset["IKA"],
        z=subset["IKTL"],
        mode="markers",
        name=f"{label} ({len(subset)} provinsi)",
        marker={"size": 8, "color": colors[label], "opacity": 0.8},
        text=subset["Provinsi"],
        hovertemplate=hover_template,
    )
    fig.add_trace(province_trace)

# Add the centroids as larger diamond markers in their category's colour
for idx, center in enumerate(centroids):
    category = label_map[idx]
    centroid_marker = {
        "size": 15,
        "color": colors[category],
        "symbol": "diamond",
        "line": {"color": "black", "width": 2},
    }
    centroid_trace = go.Scatter3d(
        x=[center[0]],
        y=[center[1]],
        z=[center[2]],
        mode="markers",
        name=f"Centroid {category}",
        marker=centroid_marker,
    )
    fig.add_trace(centroid_trace)

axis_titles = {
    "xaxis_title": "IKU (Indeks Kualitas Udara)",
    "yaxis_title": "IKA (Indeks Kualitas Air)",
    "zaxis_title": "IKTL (Indeks Kualitas Tutupan Lahan)",
}
fig.update_layout(
    title="<b>Visualisasi 3D K-Means Clustering</b><br><sub>Analisis Provinsi Indonesia Berdasarkan IKU, IKA, IKTL</sub>",
    scene=axis_titles,
    legend={"x": 0.02, "y": 0.98},
    width=1400,
    height=900,
    template="plotly_white",
)

fig.show()

print("\n✅ Visualisasi 3D berhasil ditampilkan.")


=== K-MEANS (TAHAP DEMI TAHAP) ===

Centroid awal:
 C0: IKU=96.28, IKA=55.61, IKTL=100.00
 C1: IKU=95.33, IKA=64.67, IKTL=100.00
 C2: IKU=90.89, IKA=50.84, IKTL=51.30


===== ITERASI 1 =====

📏 Tabel jarak Euclidean:
            Provinsi   IKU   IKA  IKTL  Jarak_C0  Jarak_C1  Jarak_C2
                ACEH 90.94 61.30  76.5 24.761698 24.142887 27.284686
      SUMATERA UTARA 90.90 60.27  49.8 50.702071 50.586806  9.548560
      SUMATERA BARAT 90.53 57.05  67.0 33.528139 34.206789 16.887383
    SUMATERA SELATAN 87.76 58.16  44.1 56.603029 56.784637 10.734025
                RIAU 90.89 50.84  51.3 49.229006 50.820001  0.000000
               JAMBI 90.57 46.06  51.0 50.247454 52.630692  4.800083
            BENGKULU 92.49 48.98  55.7 44.953431 47.082180  5.037817
             LAMPUNG 88.04 55.36  37.5 63.041336 63.608727 14.798409
KEP. BANGKA BELITUNG 90.15 57.43  39.2 61.135336 61.448271 13.798032
           KEP. RIAU 90.13 54.93  66.0 34.558427 35.747834 15.277294
         DKI JAKARTA 68.


✅ Visualisasi 3D berhasil ditampilkan.
